author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/crypto/isa-l/isa-l_crypto/sha256_mb
parent     Initial commit. (diff)
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sha256_mb')
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am  127
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c  256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c  59
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c  254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S  36
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S  238
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S  289
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S  342
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S  380
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c  268
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c  268
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c  273
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c  283
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c  301
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c  54
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c  256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c  262
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm  65
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c  146
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm  74
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm  253
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm  274
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm  288
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm  295
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm  254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm  261
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c  41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c  41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c  41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm  260
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm  246
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm  261
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm  261
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm  301
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c  160
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c  203
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c  300
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c  241
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c  129
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c  132
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm  930
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm  431
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm  426
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm  620
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm  125
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm  361
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm  574
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm  567
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c  204
49 files changed, 12711 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
new file mode 100644
index 000000000..9405c2469
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
@@ -0,0 +1,127 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha256_mb/sha256_ctx_sse.c \
+ sha256_mb/sha256_ctx_avx.c \
+ sha256_mb/sha256_ctx_avx2.c \
+ sha256_mb/sha256_ctx_base.c
+
+lsrc_x86_64 += sha256_mb/sha256_mb_mgr_init_sse.c \
+ sha256_mb/sha256_mb_mgr_init_avx2.c
+
+
+lsrc_x86_64 += sha256_mb/sha256_mb_mgr_submit_sse.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx2.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx2.asm \
+ sha256_mb/sha256_mb_x4_sse.asm \
+ sha256_mb/sha256_mb_x4_avx.asm \
+ sha256_mb/sha256_mb_x8_avx2.asm \
+ sha256_mb/sha256_multibinary.asm
+
+lsrc_x86_64 += sha256_mb/sha256_ctx_avx512.c \
+ sha256_mb/sha256_mb_mgr_init_avx512.c \
+ sha256_mb/sha256_mb_mgr_submit_avx512.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512.asm \
+ sha256_mb/sha256_mb_x16_avx512.asm
+
+lsrc_x86_64 += sha256_mb/sha256_opt_x1.asm
+
+lsrc_x86_64 += sha256_mb/sha256_ni_x1.asm \
+ sha256_mb/sha256_ni_x2.asm \
+ sha256_mb/sha256_ctx_sse_ni.c \
+ sha256_mb/sha256_ctx_avx512_ni.c \
+ sha256_mb/sha256_mb_mgr_submit_sse_ni.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse_ni.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha256_mb/sha256_ctx_base.c \
+ sha256_mb/sha256_ref.c
+
+lsrc_aarch64 += sha256_mb/aarch64/sha256_mb_multibinary.S \
+ sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c \
+ sha256_mb/aarch64/sha256_ctx_ce.c \
+ sha256_mb/aarch64/sha256_mb_mgr_ce.c \
+ sha256_mb/aarch64/sha256_mb_x1_ce.S \
+ sha256_mb/aarch64/sha256_mb_x2_ce.S \
+ sha256_mb/aarch64/sha256_mb_x3_ce.S \
+ sha256_mb/aarch64/sha256_mb_x4_ce.S
+
+
+lsrc_base_aliases += sha256_mb/sha256_ctx_base_aliases.c \
+ sha256_mb/sha256_ctx_base.c \
+ sha256_mb/sha256_ref.c
+
+src_include += -I $(srcdir)/sha256_mb
+
+extern_hdrs += include/sha256_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha256_mb/sha256_job.asm \
+ sha256_mb/sha256_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha256_mb/sha256_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha256_mb/sha256_mb_test \
+ sha256_mb/sha256_mb_rand_test \
+ sha256_mb/sha256_mb_rand_update_test \
+ sha256_mb/sha256_mb_flush_test
+
+unit_tests += sha256_mb/sha256_mb_rand_ssl_test
+
+perf_tests += sha256_mb/sha256_mb_vs_ossl_perf \
+ sha256_mb/sha256_mb_vs_ossl_shortage_perf
+
+sha256_mb_rand_ssl_test: sha256_ref.o
+sha256_mb_rand_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_update_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_flush_test: sha256_ref.o
+sha256_mb_sha256_mb_flush_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
new file mode 100644
index 000000000..4776f55bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state);
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job);
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_ce(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_ce_slver_02020142;
+struct slver sha256_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha256_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha256_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
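The context layer added above follows the usual ISA-L multi-buffer pattern: a caller owns one SHA256_HASH_CTX_MGR plus one SHA256_HASH_CTX per message, submits buffers through the manager, and flushes at the end to drain lanes that never filled. The sketch below is an illustration only, not part of the patch; it drives the public sha256_mb.h entry points (sha256_ctx_mgr_init/submit/flush, which dispatch to the _ce variants above when SHA-2 extensions are present), and the hash_ctx_init macro, HASH_ENTIRE flag, and 16-byte alignment requirement are taken from that header as I understand it.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "sha256_mb.h"

int main(void)
{
	SHA256_HASH_CTX_MGR *mgr = NULL;
	SHA256_HASH_CTX ctxpool[2];
	const char *msgs[2] = { "abc", "multi-buffer sha256" };
	int i;

	/* The manager structure is expected to be 16-byte aligned. */
	if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
		return 1;
	sha256_ctx_mgr_init(mgr);

	for (i = 0; i < 2; i++) {
		hash_ctx_init(&ctxpool[i]);	/* clear status/error fields */
		/* HASH_ENTIRE = HASH_FIRST | HASH_LAST: whole message in one call */
		sha256_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i],
				      (uint32_t)strlen(msgs[i]), HASH_ENTIRE);
	}

	/* Flush drains lanes that never reached the lane-count threshold. */
	while (sha256_ctx_mgr_flush(mgr) != NULL)
		;

	for (i = 0; i < 2; i++)
		printf("job %d, first digest word: %08x\n", i,
		       ctxpool[i].job.result_digest[0]);

	free(mgr);
	return 0;
}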
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..8627991c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_submit_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_init_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_flush_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_flush);
+
+}
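The dispatcher above gates the CE (Crypto Extension) code paths on the kernel-reported HWCAP_SHA2 bit and otherwise falls back to the generic base implementation. As a hedged aside, the same probe can be performed outside the library when you want to know in advance whether the hardware path will be taken; this assumes Linux on aarch64, with getauxval from <sys/auxv.h> and HWCAP_SHA2 from <asm/hwcap.h>.

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

	if (hwcap & HWCAP_SHA2)
		printf("SHA-2 instructions available: the _ce providers will be selected\n");
	else
		printf("no SHA-2 instructions: the base C provider will be selected\n");
	return 0;
}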
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
new file mode 100644
index 000000000..aa63c4dd8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha256_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA256_MB_CE_MAX_LANES 3
+
+#if SHA256_MB_CE_MAX_LANES >=4
+void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+#endif
+void sha256_mb_ce_x1(SHA256_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state)
+{
+ int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = SHA256_MB_CE_MAX_LANES - 1; i >= 0; i--) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+ //lanes beyond SHA256_MB_CE_MAX_LANES are invalid
+ for (i = SHA256_MB_CE_MAX_LANES; i < SHA256_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha256_mb_mgr_do_jobs(SHA256_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA256_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA256_MB_CE_MAX_LANES == 4
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+
+ } else
+#elif SHA256_MB_CE_MAX_LANES == 3
+ if (state->num_lanes_inuse == 3) {
+ len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x3(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane, len >> 4);
+
+ } else
+#elif SHA256_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA256_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+#if SHA256_MB_CE_MAX_LANES >=4
+ if (lanes == 4) {
+ sha256_mb_ce_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+
+ } else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+ if (lanes == 3) {
+ sha256_mb_ce_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha256_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha256_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+ //the min-length lane (lane_idx) just finished; advance every unfinished lane past the processed blocks
+ for (i = 0; i < SHA256_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA256_JOB *sha256_mb_mgr_free_lane(SHA256_MB_JOB_MGR * state)
+{
+ int i;
+ SHA256_JOB *ret = NULL;
+
+ for (i = 0; i < SHA256_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha256_mb_mgr_insert_job(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SHA256_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA256_JOB *ret;
+
+ //add job into lanes
+ sha256_mb_mgr_insert_job(state, job);
+
+ ret = sha256_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+ //submit waits until every lane has data before doing any processing
+ if (state->num_lanes_inuse < SHA256_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha256_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha256_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha256_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state)
+{
+ SHA256_JOB *ret;
+ ret = sha256_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha256_mb_mgr_do_jobs(state);
+ return sha256_mb_mgr_free_lane(state);
+
+}
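A detail worth calling out in the job manager above: each entry of state->lens[] packs a lane's remaining length in 64-byte blocks into the upper bits and the lane index into the low nibble (lens[i] = (blocks << 4) | i), so taking the minimum over active lanes yields both the common number of blocks to run and which lane will finish first. The sketch below is an illustration only, not part of the patch; it shows that encoding in isolation.

#include <stdio.h>
#include <stdint.h>

/* lens[] packing used by sha256_mb_mgr_ce.c: block count in the high bits,
 * lane index in the low 4 bits. */
static uint32_t pack_len(uint32_t blocks, uint32_t lane)
{
	return (blocks << 4) | lane;
}

int main(void)
{
	uint32_t lens[3] = { pack_len(5, 0), pack_len(2, 1), pack_len(9, 2) };
	uint32_t m = lens[0];
	int i;

	for (i = 1; i < 3; i++)
		if (lens[i] < m)
			m = lens[i];

	/* Same decode as sha256_mb_mgr_do_jobs(): */
	uint32_t lane_idx = m & 0xf;	/* lane that finishes first (lane 1)    */
	uint32_t len = m & ~0xfu;	/* packed common length (2 blocks here) */

	/* Every unfinished lane is advanced by len; lane 1 drops to zero
	 * remaining blocks and is handed back as the completed job. */
	printf("min lane = %u, common blocks = %u\n", lane_idx, len >> 4);
	return 0;
}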
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
new file mode 100644
index 000000000..ecc5fc5f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include <aarch64_multibinary.h>
+
+
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
new file mode 100644
index 000000000..06d0ab5fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
@@ -0,0 +1,238 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+maros for round 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+
+.endm
+/**
+maros for round 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l0_abcd_saved,5
+ declare_var_vector_reg l0_efgh_saved,6
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,2
+ declare_var_vector_reg l0_tmp1,3
+ declare_var_vector_reg l0_tmp2,4
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+
+
+
+/*
+ void sha256_mb_ce_x1(SHA256_JOB * l0_job, int len);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ len .req w1
+ l0_data .req x2
+ tmp .req x3
+ .global sha256_mb_ce_x1
+ .type sha256_mb_ce_x1, %function
+sha256_mb_ce_x1:
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+
+
+
+start_loop:
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s,l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+
+
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+
+
+ bgt start_loop
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+
+ ret
+
+ .size sha256_mb_ce_x1, .-sha256_mb_ce_x1
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
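The round macros above map directly onto the ARMv8 SHA-256 instructions: sha256h/sha256h2 advance four rounds of the compression function, sha256su0/sha256su1 extend the message schedule, and the KEY table holds the 64 standard round constants. For readers who prefer C, here is a hedged single-block equivalent using the matching ACLE intrinsics from <arm_neon.h> (vsha256hq_u32 and friends). It mirrors the structure of the macros but is only an illustration, not the code this file assembles to; build with something like -march=armv8-a+crypto.

#include <stdint.h>
#include <arm_neon.h>

/* The 64 SHA-256 round constants, same values as the KEY table above. */
static const uint32_t K[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

/* One 64-byte block; digest holds H0..H7 as eight 32-bit words, the same
 * layout the assembly loads from the job at offsets 64 and 80. */
void sha256_block_ce(uint32_t digest[8], const uint8_t block[64])
{
	uint32x4_t abcd = vld1q_u32(digest);
	uint32x4_t efgh = vld1q_u32(digest + 4);
	const uint32x4_t abcd_saved = abcd, efgh_saved = efgh;
	uint32x4_t msg[4];
	int g;

	/* Load and byte-swap W0..W15 (the rev32 instructions above). */
	for (g = 0; g < 4; g++)
		msg[g] = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block + 16 * g)));

	/* W + K for the first group of four rounds, as computed before start_loop. */
	uint32x4_t wk = vaddq_u32(msg[0], vld1q_u32(&K[0]));

	for (g = 0; g < 16; g++) {	/* sixteen groups of four rounds */
		const uint32x4_t abcd_prev = abcd;	/* the tmp2 copy */

		abcd = vsha256hq_u32(abcd, efgh, wk);	    /* sha256h  */
		efgh = vsha256h2q_u32(efgh, abcd_prev, wk); /* sha256h2 */

		if (g < 12)	/* extend the schedule for rounds 16..63 */
			msg[g & 3] = vsha256su1q_u32(
			    vsha256su0q_u32(msg[g & 3], msg[(g + 1) & 3]),
			    msg[(g + 2) & 3], msg[(g + 3) & 3]);
		if (g < 15)	/* W + K for the next four rounds */
			wk = vaddq_u32(msg[(g + 1) & 3], vld1q_u32(&K[4 * (g + 1)]));
	}

	vst1q_u32(digest, vaddq_u32(abcd, abcd_saved));
	vst1q_u32(digest + 4, vaddq_u32(efgh, efgh_saved));
}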
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
new file mode 100644
index 000000000..dadf44bb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
@@ -0,0 +1,289 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+
+.endm
+/**
+macros for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l0_abcd_saved,2
+ declare_var_vector_reg l0_efgh_saved,3
+ declare_var_vector_reg l1_abcd,4
+ declare_var_vector_reg l1_efgh,5
+ declare_var_vector_reg l1_abcd_saved,6
+ declare_var_vector_reg l1_efgh_saved,7
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,8
+ declare_var_vector_reg l0_tmp1,9
+ declare_var_vector_reg l0_tmp2,10
+ declare_var_vector_reg l1_tmp0,11
+ declare_var_vector_reg l1_tmp1,12
+ declare_var_vector_reg l1_tmp2,13
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+
+
+
+/*
+ void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+ l0_data .req x3
+ l1_data .req x4
+ tmp .req x5
+ .global sha256_mb_ce_x2
+ .type sha256_mb_ce_x2, %function
+sha256_mb_ce_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+ mov l1_abcd_saved_v.16b,l1_abcd_v.16b
+ mov l1_efgh_saved_v.16b,l1_efgh_v.16b
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+
+
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+
+
+ bgt start_loop
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x2, .-sha256_mb_ce_x2
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
new file mode 100644
index 000000000..6ed1591ba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
@@ -0,0 +1,342 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ mov l2_tmp2_v.16b,l2_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ add l2_\tmp1\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+ sha256h l2_abcd_q,l2_efgh_q,l2_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+ sha256h2 l2_efgh_q,l2_tmp2_q,l2_\tmp0\()_v.4s
+
+.endm
+/**
+macros for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+ sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l1_abcd,2
+ declare_var_vector_reg l1_efgh,3
+ declare_var_vector_reg l2_abcd,4
+ declare_var_vector_reg l2_efgh,5
+ declare_var_vector_reg l1_abcd_saved,16
+ declare_var_vector_reg l1_efgh_saved,17
+ declare_var_vector_reg l0_abcd_saved,20
+ declare_var_vector_reg l0_efgh_saved,21
+ declare_var_vector_reg l2_abcd_saved,24
+ declare_var_vector_reg l2_efgh_saved,25
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,6
+ declare_var_vector_reg l0_tmp1,7
+ declare_var_vector_reg l0_tmp2,8
+ declare_var_vector_reg l1_tmp0,9
+ declare_var_vector_reg l1_tmp1,10
+ declare_var_vector_reg l1_tmp2,11
+ declare_var_vector_reg l2_tmp0,12
+ declare_var_vector_reg l2_tmp1,13
+ declare_var_vector_reg l2_tmp2,14
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+ declare_var_vector_reg l2_msg0,24
+ declare_var_vector_reg l2_msg1,25
+ declare_var_vector_reg l2_msg2,26
+ declare_var_vector_reg l2_msg3,27
+
+
+
+/*
+ void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ l2_job .req x2
+ len .req w3
+ l0_data .req x4
+ l1_data .req x5
+ l2_data .req x6
+ tmp .req x7
+ .global sha256_mb_ce_x3
+ .type sha256_mb_ce_x3, %function
+sha256_mb_ce_x3:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+ ldr l2_data, [l2_job]
+ ldr l2_abcd_q, [l2_job, 64]
+ ldr l2_efgh_q, [l2_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ add l2_data,l2_data,64
+ sub len, len, #1
+ cmp len, 0
+/*
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+ mov l1_abcd_saved_v.16b,l1_abcd_v.16b
+ mov l1_efgh_saved_v.16b,l1_efgh_v.16b
+ mov l2_abcd_saved_v.16b,l2_abcd_v.16b
+ mov l2_efgh_saved_v.16b,l2_efgh_v.16b
+*/
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+ rev32 l2_msg0_v.16b,l2_msg0_v.16b
+ rev32 l2_msg1_v.16b,l2_msg1_v.16b
+ add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+ rev32 l2_msg2_v.16b,l2_msg2_v.16b
+ rev32 l2_msg3_v.16b,l2_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+
+ /* msg0 and msg1 are free now, so the saved-digest registers reuse them */
+ ldr l0_abcd_saved_q, [l0_job, 64]
+ ldr l1_abcd_saved_q, [l1_job, 64]
+ ldr l2_abcd_saved_q, [l2_job, 64]
+ ldr l0_efgh_saved_q, [l0_job, 80]
+ ldr l1_efgh_saved_q, [l1_job, 80]
+ ldr l2_efgh_saved_q, [l2_job, 80]
+
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+ mov l2_tmp2_v.16b,l2_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,l2_tmp1_v.4s
+ sha256h2 l2_efgh_q,l2_tmp2_q,l2_tmp1_v.4s
+
+ /* combine state */
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+ add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+ add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+ str l2_abcd_q, [l2_job, 64]
+ str l2_efgh_q, [l2_job, 80]
+
+ bgt start_loop
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x3, .-sha256_mb_ce_x3
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
new file mode 100644
index 000000000..b1686ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
@@ -0,0 +1,380 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macro for rounds 48-63
+\tmp0 : per-lane scratch; holds the previous msg+key sum on entry and the
+        current one on exit
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req
+ ldr key_q , [tmp]
+ mov tmp0_v.16b,l0_\tmp0\()_v.16b
+ mov tmp1_v.16b,l1_\tmp0\()_v.16b
+ add l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ mov tmp2_v.16b,l0_abcd_v.16b
+ mov tmp3_v.16b,l1_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,tmp0_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,tmp1_v.4s
+ sha256h2 l0_efgh_q,tmp2_q,tmp0_v.4s
+ sha256h2 l1_efgh_q,tmp3_q,tmp1_v.4s
+
+ ldr key_q , [tmp]
+ mov tmp0_v.16b,l2_\tmp0\()_v.16b
+ mov tmp1_v.16b,l3_\tmp0\()_v.16b
+ add tmp,tmp,16
+ add l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+ add l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s
+ mov tmp2_v.16b,l2_abcd_v.16b
+ mov tmp3_v.16b,l3_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,tmp0_v.4s
+ sha256h l3_abcd_q,l3_efgh_q,tmp1_v.4s
+ sha256h2 l2_efgh_q,tmp2_q,tmp0_v.4s
+ sha256h2 l3_efgh_q,tmp3_q,tmp1_v.4s
+
+
+.endm
+/**
+macro for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+ sha256su0 l3_\msg0\()_v.4s,l3_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+ sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+ sha256su1 l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,15
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l1_abcd,2
+ declare_var_vector_reg l1_efgh,3
+ declare_var_vector_reg l2_abcd,4
+ declare_var_vector_reg l2_efgh,5
+ declare_var_vector_reg l3_abcd,6
+ declare_var_vector_reg l3_efgh,7
+ declare_var_vector_reg l1_abcd_saved,16
+ declare_var_vector_reg l1_efgh_saved,17
+ declare_var_vector_reg l0_abcd_saved,20
+ declare_var_vector_reg l0_efgh_saved,21
+ declare_var_vector_reg l2_abcd_saved,24
+ declare_var_vector_reg l2_efgh_saved,25
+ declare_var_vector_reg l3_abcd_saved,28
+ declare_var_vector_reg l3_efgh_saved,29
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,8
+ declare_var_vector_reg l1_tmp0,9
+ declare_var_vector_reg l2_tmp0,10
+ declare_var_vector_reg l3_tmp0,11
+
+ declare_var_vector_reg tmp0,12
+ declare_var_vector_reg tmp1,13
+ declare_var_vector_reg tmp2,14
+ declare_var_vector_reg tmp3,15
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+ declare_var_vector_reg l2_msg0,24
+ declare_var_vector_reg l2_msg1,25
+ declare_var_vector_reg l2_msg2,26
+ declare_var_vector_reg l2_msg3,27
+ declare_var_vector_reg l3_msg0,28
+ declare_var_vector_reg l3_msg1,29
+ declare_var_vector_reg l3_msg2,30
+ declare_var_vector_reg l3_msg3,31
+
+
+
+/*
+ void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ l2_job .req x2
+ l3_job .req x3
+ len .req w4
+ l0_data .req x5
+ l1_data .req x6
+ l2_data .req x7
+ l3_data .req x8
+ tmp .req x9
+ .global sha256_mb_ce_x4
+ .type sha256_mb_ce_x4, %function
+sha256_mb_ce_x4:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+ ldr l2_data, [l2_job]
+ ldr l2_abcd_q, [l2_job, 64]
+ ldr l2_efgh_q, [l2_job, 80]
+ ldr l3_data, [l3_job]
+ ldr l3_abcd_q, [l3_job, 64]
+ ldr l3_efgh_q, [l3_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+ ld1 {l3_msg0_v.4s-l3_msg3_v.4s},[l3_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ add l2_data,l2_data,64
+ add l3_data,l3_data,64
+ sub len, len, #1
+ cmp len, 0
+
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+ rev32 l2_msg0_v.16b,l2_msg0_v.16b
+ rev32 l2_msg1_v.16b,l2_msg1_v.16b
+ add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+ rev32 l2_msg2_v.16b,l2_msg2_v.16b
+ rev32 l2_msg3_v.16b,l2_msg3_v.16b
+
+ rev32 l3_msg0_v.16b,l3_msg0_v.16b
+ rev32 l3_msg1_v.16b,l3_msg1_v.16b
+ add l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s
+ rev32 l3_msg2_v.16b,l3_msg2_v.16b
+ rev32 l3_msg3_v.16b,l3_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+
+
+
+ sha256_4_rounds_high msg1,tmp0 /* rounds 48-51 */
+
+	/* msg0 and msg1 are no longer needed, so the saved digests reuse their registers */
+ ldr l0_abcd_saved_q, [l0_job, 64]
+ ldr l1_abcd_saved_q, [l1_job, 64]
+ ldr l2_abcd_saved_q, [l2_job, 64]
+ ldr l3_abcd_saved_q, [l3_job, 64]
+ ldr l0_efgh_saved_q, [l0_job, 80]
+ ldr l1_efgh_saved_q, [l1_job, 80]
+ ldr l2_efgh_saved_q, [l2_job, 80]
+ ldr l3_efgh_saved_q, [l3_job, 80]
+
+ sha256_4_rounds_high msg2,tmp0
+ sha256_4_rounds_high msg3,tmp0
+
+ /* rounds 60-63 */
+ mov tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s
+ sha256h2 l0_efgh_q,tmp2_q,l0_tmp0_v.4s
+
+ mov tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s
+ sha256h2 l1_efgh_q,tmp2_q,l1_tmp0_v.4s
+
+ mov tmp2_v.16b,l2_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s
+ sha256h2 l2_efgh_q,tmp2_q,l2_tmp0_v.4s
+
+ mov tmp2_v.16b,l3_abcd_v.16b
+ sha256h l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s
+ sha256h2 l3_efgh_q,tmp2_q,l3_tmp0_v.4s
+
+ /* combine state */
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+ add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+ add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+ add l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s
+ add l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+ str l2_abcd_q, [l2_job, 64]
+ str l2_efgh_q, [l2_job, 80]
+ str l3_abcd_q, [l3_job, 64]
+ str l3_efgh_q, [l3_job, 80]
+
+ bgt start_loop
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x4, .-sha256_mb_ce_x4
+	.section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
new file mode 100644
index 000000000..12441a8e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx_slver_02020154;
+struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_avx_slver_02020155;
+struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_avx_slver_02020156;
+struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
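Taken together, init/submit/flush above make up the whole user-facing flow for the AVX variant. A minimal usage sketch (not the library's documented example): hash_ctx_init() is assumed to come from multi_buffer.h, the manager is given 16-byte aligned storage as the isa-l tests do, and error handling is omitted.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha256_mb.h"

    int main(void)
    {
            SHA256_HASH_CTX_MGR *mgr = NULL;
            SHA256_HASH_CTX ctx;
            const char msg[] = "abc";
            int i;

            posix_memalign((void **)&mgr, 16, sizeof(*mgr));
            sha256_ctx_mgr_init_avx(mgr);
            hash_ctx_init(&ctx);        /* assumed helper from multi_buffer.h */

            /* One whole buffer in a single call: HASH_ENTIRE == HASH_FIRST | HASH_LAST. */
            sha256_ctx_mgr_submit_avx(mgr, &ctx, msg, (uint32_t)strlen(msg), HASH_ENTIRE);

            /* Drain the manager until every in-flight job has been returned. */
            while (sha256_ctx_mgr_flush_avx(mgr) != NULL)
                    ;

            for (i = 0; i < SHA256_DIGEST_NWORDS; i++)
                    printf("%08x", ctx.job.result_digest[i]);
            printf("\n");
            free(mgr);
            return 0;
    }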
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
new file mode 100644
index 000000000..9c045659e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx2_slver_04020157;
+struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_submit_avx2_slver_04020158;
+struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_flush_avx2_slver_04020159;
+struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
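The hash_pad() helper above decides how many trailing blocks the padding needs with a branch-free index computation. The short check below restates it with the constants spelled out (64-byte block and the 8-byte length field SHA-256 uses, both assumed here as literals) and confirms the simpler rule it encodes: two extra blocks exactly when fewer than 9 bytes remain free in the last partial block.

    #include <assert.h>
    #include <stdint.h>

    /* Same index arithmetic as hash_pad(), with SHA256_BLOCK_SIZE = 64 and
     * SHA256_PADLENGTHFIELD_SIZE = 8 written out as literals. */
    static uint32_t extra_blocks(uint64_t total_len)
    {
            uint32_t i = (uint32_t)(total_len & 63);              /* bytes already in the block */
            i += ((64 - 1) & (0 - (total_len + 8 + 1))) + 1 + 8;  /* 0x80 marker + zeros + length */
            return i >> 6;                                        /* log2(64) */
    }

    int main(void)
    {
            for (uint64_t n = 0; n < 4096; n++)
                    assert(extra_blocks(n) == (((n % 64) >= 56) ? 2u : 1u));
            return 0;
    }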
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
new file mode 100644
index 000000000..a1f068987
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
@@ -0,0 +1,273 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_slver_0600015a;
+struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b;
+struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c;
+struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
new file mode 100644
index 000000000..763057f12
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
@@ -0,0 +1,283 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * The sha256_ctx_avx512_ni functions are aimed at Cannon Lake parts.
+ * Since SHA-NI is still slower than the multibuffer path when all lanes
+ * are full, sha256_ctx_mgr_init_avx512_ni and sha256_ctx_mgr_submit_avx512_ni
+ * are similar to their avx512 versions.
+ * sha256_ctx_mgr_flush_avx512_ni is different: it calls
+ * sha256_mb_mgr_flush_avx512_ni, which switches to SHA-NI when the number
+ * of active lanes drops below a threshold.
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512_ni(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_ni_slver_080002ca;
+struct slver sha256_ctx_mgr_init_avx512_ni_slver = { 0x02ca, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver_080002cb;
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver = { 0x02cb, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver_080002cc;
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver = { 0x02cc, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
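The header comment of this file describes the one behavioural difference in the _ni variant: flushing switches to a SHA-NI kernel once only a few lanes are still busy. A hypothetical sketch of that decision follows; the threshold value and the helper name are assumptions, not what sha256_mb_mgr_flush_avx512_ni.asm actually uses.

    #include <stdint.h>

    /* Illustrative only: with just a handful of jobs left, a serial SHA-NI
     * pass finishes them sooner than a mostly idle 16-lane AVX-512 pass. */
    #define SHANI_LANE_THRESHOLD 2   /* assumed cutoff, not the real value */

    static inline int flush_should_use_shani(uint32_t busy_lanes)
    {
            return busy_lanes <= SHANI_LANE_THRESHOLD;
    }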
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
new file mode 100644
index 000000000..58bf024a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
@@ -0,0 +1,301 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha256_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+
+void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr)
+{
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha256_init(ctx, buffer, len);
+ sha256_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha256_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha256_update(ctx, buffer, len);
+ sha256_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha256_init(ctx, buffer, len);
+ remain_len = sha256_update(ctx, buffer, len);
+ sha256_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA256_BLOCK_SIZE) {
+ sha256_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA256_BLOCK_SIZE);
+ remain_len -= SHA256_BLOCK_SIZE;
+ ctx->total_length += SHA256_BLOCK_SIZE;
+ }
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE) {
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha256_single(const void *data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_base_slver_000002f0;
+struct slver sha256_ctx_mgr_init_base_slver = { 0x02f0, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_base_slver_000002f1;
+struct slver sha256_ctx_mgr_submit_base_slver = { 0x02f1, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_base_slver_000002f2;
+struct slver sha256_ctx_mgr_flush_base_slver = { 0x02f2, 0x00, 0x00 };
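
The base code above implements the SHA-256 tail in sha256_final(): a 0x80 marker byte, zero fill, and an 8-byte big-endian bit count placed at the end of either the first or the second extra block. A minimal C sketch of that block-count rule, assuming the 8-byte length field the base code uses (SHA256_PADLENGTHFIELD_SIZE); pad_blocks_needed() is an illustrative name, not a library function:

#include <stdint.h>

/* Number of 64-byte blocks the padded tail occupies, given how many
 * message bytes remain after the last full block (0..63). */
static uint32_t pad_blocks_needed(uint32_t remain_len)
{
        /* one 0x80 byte plus the 8-byte length field must still fit */
        return (remain_len + 1 + 8 > 64) ? 2 : 1;
}
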
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
new file mode 100644
index 000000000..1483f631c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr);
+
+void sha256_ctx_mgr_init(SHA256_HASH_CTX_MGR * mgr)
+{
+ return sha256_ctx_mgr_init_base(mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha256_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush(SHA256_HASH_CTX_MGR * mgr)
+{
+ return sha256_ctx_mgr_flush_base(mgr);
+}
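
These aliases route the generic sha256_ctx_mgr_* entry points to the base implementation. A minimal, hedged usage sketch of that generic API, modeled on the flush test further below (hash_ctx_init, HASH_ENTIRE, and result_digest are taken from that test; error handling is trimmed):

#include <stdio.h>
#include <stdlib.h>
#include "sha256_mb.h"

int main(void)
{
        SHA256_HASH_CTX_MGR *mgr = NULL;
        SHA256_HASH_CTX ctx;
        static const unsigned char msg[] = "abc";
        int i;

        /* the tests allocate the manager 16-byte aligned */
        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
                return 1;

        sha256_ctx_mgr_init(mgr);
        hash_ctx_init(&ctx);

        /* submit the whole message at once, then drain the manager */
        sha256_ctx_mgr_submit(mgr, &ctx, msg, sizeof(msg) - 1, HASH_ENTIRE);
        while (sha256_ctx_mgr_flush(mgr))
                ;

        for (i = 0; i < SHA256_DIGEST_NWORDS; i++)
                printf("%08x", ctx.job.result_digest[i]);
        printf("\n");

        free(mgr);
        return 0;
}
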
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
new file mode 100644
index 000000000..f85f5c88b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_slver_00020151;
+struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_sse_slver_00020152;
+struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_sse_slver_00020153;
+struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 };
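
The flag handling above (HASH_FIRST / HASH_LAST, with HASH_ENTIRE being both) also supports streaming a message across several submits. A hedged sketch of that flow for a single job, assuming the HASH_UPDATE flag the comments refer to; with only one job in flight, flushing until the context is handed back guarantees the next submit never hits HASH_CTX_ERROR_ALREADY_PROCESSING:

#include <stdint.h>
#include "sha256_mb.h"

/* Hash one message supplied in three pieces; returns 0 on success and
 * leaves the digest in ctx->job.result_digest.  Sketch only. */
static int sha256_stream_three_parts(SHA256_HASH_CTX_MGR *mgr, SHA256_HASH_CTX *ctx,
                                     const void *p1, uint32_t l1,
                                     const void *p2, uint32_t l2,
                                     const void *p3, uint32_t l3)
{
        SHA256_HASH_CTX *c;

        hash_ctx_init(ctx);

        c = sha256_ctx_mgr_submit(mgr, ctx, p1, l1, HASH_FIRST);
        while (!c)
                c = sha256_ctx_mgr_flush(mgr);  /* wait for this job to go idle */

        c = sha256_ctx_mgr_submit(mgr, ctx, p2, l2, HASH_UPDATE);
        while (!c)
                c = sha256_ctx_mgr_flush(mgr);

        c = sha256_ctx_mgr_submit(mgr, ctx, p3, l3, HASH_LAST);
        while (!c)
                c = sha256_ctx_mgr_flush(mgr);  /* padding appended, digest finished */

        return (ctx->status == HASH_CTX_STS_COMPLETE) ? 0 : -1;
}
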
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
new file mode 100644
index 000000000..e2c7e2738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
@@ -0,0 +1,262 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ // Same as the SSE manager init
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse_ni(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_ni_slver_070002c7;
+struct slver sha256_ctx_mgr_init_sse_ni_slver = { 0x02c7, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_submit_sse_ni_slver_070002c8;
+struct slver sha256_ctx_mgr_submit_sse_ni_slver = { 0x02c8, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_flush_sse_ni_slver_070002c9;
+struct slver sha256_ctx_mgr_flush_sse_ni_slver = { 0x02c9, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
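
Both ctx wrappers split user data into whole blocks for the job and a sub-block remainder that waits in partial_block_buffer. A small sketch of that split, relying only on SHA256_BLOCK_SIZE being the usual 64-byte power of two:

#include <stdint.h>

#define BLOCK_SIZE 64  /* SHA256_BLOCK_SIZE */

/* Mirror of the resubmit-path arithmetic: whole blocks go to the job,
 * the remainder is buffered until more data (or HASH_LAST) arrives. */
static void split_blocks(uint32_t len, uint32_t *whole_blocks, uint32_t *remainder)
{
        *remainder = len & (BLOCK_SIZE - 1);
        *whole_blocks = (len - *remainder) / BLOCK_SIZE;
}
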
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
new file mode 100644
index 000000000..f9fb6d230
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SHA256_SB_THRESHOLD_SSE 1
+%define SHA256_SB_THRESHOLD_AVX 1
+%define SHA256_SB_THRESHOLD_AVX2 1
+%define SHA256_SB_THRESHOLD_AVX512 1
+%define SHA256_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb
+%define SHA256_NI_SB_THRESHOLD_AVX512 6
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in blocks
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA256_JOB_size _FIELD_OFFSET
+%assign _SHA256_JOB_align _STRUCT_ALIGN
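
The *_SB_THRESHOLD_* constants above drive a small dispatch decision inside the flush routines that follow: with few occupied lanes the single-buffer kernel (sha256_opt_x1) is called once instead of running the multi-buffer kernel on mostly empty lanes. Expressed as plain C pseudologic, not library code:

/* num_lanes_inuse is the manager's live-lane count; sb_threshold is one of
 * the SHA256_SB_THRESHOLD_* / SHA256_NI_SB_THRESHOLD_* values above. */
static int should_use_single_buffer(unsigned num_lanes_inuse, unsigned sb_threshold)
{
        return num_lanes_inuse <= sb_threshold;
}
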
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
new file mode 100644
index 000000000..28f1f5118
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA256_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SHA256_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SHA256_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SHA256_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+ // len[i] in mgr consists of block_count<<4 | lane_index; convert blocks to bytes below
+ len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sha256_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SHA256_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sha256_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+ printf("Info of sha256_mb lens prints over\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else if (num_unchanged)
+ printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+ num_unchanged, num_unchanged + 1);
+ else
+ printf("SHA-NI is not used, or used for last job\n");
+
+ return fail;
+}
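
The lens_print_and_check() helper above decodes the manager's packed length words. A short sketch of that encoding, derived from the test and the flush assembly below (block count in the upper bits, lane index in the low nibble); decode_mgr_len() is an illustrative helper, not part of the library:

#include <stdint.h>

static void decode_mgr_len(uint32_t entry, uint32_t *lane, uint32_t *bytes_left)
{
        *lane = entry & 0xF;             /* low nibble: lane index */
        *bytes_left = (entry >> 4) * 64; /* upper bits: remaining 64-byte blocks */
}
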
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..ebba9ca36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA256_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
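
The _digest field is laid out transposed (word-major across lanes), which is why the flush routines below gather one lane's result with a strided series of vpinsrd loads. A hedged C sketch of the same read; num_lanes is 4, 8, or 16 depending on the manager width:

#include <stdint.h>

/* Word j of lane i lives at args_digest[j * num_lanes + i]. */
static void read_lane_digest(const uint32_t *args_digest, unsigned num_lanes,
                             unsigned lane, uint32_t out[8])
{
        unsigned j;

        for (j = 0; j < 8; j++)
                out[j] = args_digest[j * num_lanes + lane];
}
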
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..69f27f42d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx, function
+sha256_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ ; compare with the sha single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
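
The "Find min length" block above scans the packed lens words: since every entry is (blocks << 4) | lane, the minimum simultaneously gives the lane that finishes first and the number of blocks all lanes can safely run. The same scan in scalar C, as a sketch:

#include <stdint.h>

static void find_min_len(const uint32_t lens[], unsigned num_lanes,
                         unsigned *min_lane, uint32_t *min_blocks)
{
        uint32_t m = lens[0];
        unsigned i;

        for (i = 1; i < num_lanes; i++)
                if (lens[i] < m)
                        m = lens[i];

        *min_lane = m & 0xF;    /* and idx, 0xF */
        *min_blocks = m >> 4;   /* shr len2, 4  */
}
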
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..0ee0589cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
@@ -0,0 +1,274 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x8_avx2 and sha256_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx2, function
+sha256_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ ; compare with the sha single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
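
Before the SIMD kernel runs, copy_lane_data points every empty lane at a live lane's buffer and sets its length to 0xFFFFFFFF so the min-length scan can never pick it. A scalar sketch of that step:

#include <stddef.h>
#include <stdint.h>

static void fill_empty_lanes(void *data_ptr[], uint32_t lens[],
                             void *const job_in_lane[], unsigned num_lanes,
                             void *live_data_ptr)
{
        unsigned i;

        for (i = 0; i < num_lanes; i++) {
                if (job_in_lane[i] == NULL) {
                        data_ptr[i] = live_data_ptr;   /* harmless dummy input */
                        lens[i] = 0xFFFFFFFF;          /* never the minimum */
                }
        }
}
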
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..201cd42b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
@@ -0,0 +1,288 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_opt_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx512, function
+sha256_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_flush_avx512
+no_sha256_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
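Throughout these flush routines, each 32-bit entry of _lens packs the remaining length in 64-byte blocks in its upper 28 bits and the lane index in its low nibble, and empty lanes are parked at 0xFFFFFFFF so they can never win the minimum. The vpminud/vpalignr/vperm2i128 tree above therefore yields both the shortest job and the lane it lives in from a single reduction. The following C sketch (hypothetical helper names, not part of ISA-L) shows the scalar equivalent of that encoding and scan.

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_LANES 16   /* the AVX-512 manager tracks 16 lanes */

    /* Each lens[] entry packs (blocks_remaining << 4) | lane_index.
     * Hypothetical helpers sketching the encoding, not the ISA-L API. */
    static uint32_t pack_len(uint32_t blocks, uint32_t lane)
    {
            return (blocks << 4) | (lane & 0xF);
    }

    /* Scalar equivalent of the vpminud/vpalignr/vperm2i128 reduction:
     * the minimum packed value identifies both the shortest job (len2)
     * and the lane it lives in (idx). */
    static void find_min(const uint32_t lens[NUM_LANES],
                         uint32_t *idx, uint32_t *len2)
    {
            uint32_t min = lens[0];
            for (int i = 1; i < NUM_LANES; i++)
                    if (lens[i] < min)
                            min = lens[i];
            *idx  = min & 0xF;   /* lane index in the low nibble       */
            *len2 = min >> 4;    /* remaining length in 64-byte blocks */
    }

    int main(void)
    {
            uint32_t lens[NUM_LANES];
            for (int i = 0; i < NUM_LANES; i++)
                    lens[i] = 0xFFFFFFFF;      /* empty lanes never win   */
            lens[3] = pack_len(7, 3);          /* 7 blocks left in lane 3 */
            lens[9] = pack_len(2, 9);          /* 2 blocks left in lane 9 */

            uint32_t idx, len2;
            find_min(lens, &idx, &len2);
            printf("shortest job: lane %u, %u blocks\n",
                   (unsigned)idx, (unsigned)len2);   /* lane 9, 2 blocks */
            return 0;
    }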
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..7bc9d32a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha256_mb_x16_avx512
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_ni_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx512_ni, function
+sha256_mb_mgr_flush_avx512_ni:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA-NI single-buffer threshold; if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni
+ no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni
+ no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
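Both AVX-512 flush variants only run the 16-lane kernel when enough lanes are occupied; below a threshold the fixed cost of the wide kernel is not worth it, and the lane holding the shortest job is processed with sha256_opt_x1 or sha256_ni_x1 instead. A rough C sketch of that dispatch follows; the threshold constant and the function-pointer signature are placeholders, since the real SHA256_SB_THRESHOLD_AVX512 / SHA256_NI_SB_THRESHOLD_AVX512 values and kernel prototypes live in the ISA-L headers.

    /* Placeholder threshold; the real values come from the ISA-L headers. */
    enum { SB_THRESHOLD = 4 };

    typedef void (*sha256_kernel_fn)(void *args, unsigned int len_blocks);

    /* Sketch of the flush dispatch: prefer the single-buffer kernel when
     * the batch is mostly empty, otherwise run the multi-buffer kernel. */
    static void flush_dispatch(unsigned int num_lanes_inuse,
                               unsigned int len_blocks, void *args,
                               sha256_kernel_fn single_buffer,
                               sha256_kernel_fn multi_buffer)
    {
            if (num_lanes_inuse <= SB_THRESHOLD)
                    single_buffer(args, len_blocks);  /* e.g. sha256_ni_x1    */
            else
                    multi_buffer(args, len_blocks);   /* sha256_mb_x16_avx512 */
    }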
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..69ae4bad5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_sse, function
+sha256_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
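The copy_lane_data step above is what lets the 4-lane kernel run safely when some lanes are idle: the data pointer of a lane that still holds a job is copied into every empty lane, and the empty lanes' packed lengths are set to 0xFFFFFFFF so they are never selected as the minimum. A minimal C sketch, using a simplified stand-in for the job-manager layout rather than the real SHA256_MB_JOB_MGR:

    #include <stdint.h>
    #include <stddef.h>

    #define LANES 4                 /* the SSE manager tracks 4 lanes */

    /* Illustrative stand-in; field names are not the ISA-L ones. */
    struct mgr_sketch {
            const void *job_in_lane[LANES];   /* NULL when the lane is empty */
            const void *data_ptr[LANES];      /* per-lane input pointer      */
            uint32_t    lens[LANES];          /* packed (blocks << 4) | lane */
    };

    /* Equivalent of the copy_lane_data block: lane `idx` is known to hold
     * a real job; point every empty lane at the same buffer and give it a
     * maximal length so it never becomes the minimum. */
    static void fill_empty_lanes(struct mgr_sketch *m, unsigned int idx)
    {
            const void *tmp = m->data_ptr[idx];
            for (unsigned int i = 0; i < LANES; i++) {
                    if (m->job_in_lane[i] == NULL) {
                            m->data_ptr[i] = tmp;
                            m->lens[i] = 0xFFFFFFFF;
                    }
            }
    }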
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..43b8fcbe4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_sse_ni, function
+sha256_mb_mgr_flush_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA-NI single-buffer threshold; if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_sse_ni
+ no_sha256_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
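The movd/pinsrd gathers in these flush routines reflect how the working digests are stored: _args_digest is transposed, with word w of lane l at dword index l + w*LANES (hence the 4*idx + w*16 byte offsets for the 4-lane managers and 4*idx + w*64 for the 16-lane ones). That layout lets the vector kernels load one digest word for all lanes with a single load; the manager gathers a finished lane's eight words back out word by word. A small C sketch of the addressing, assuming that transposed layout:

    #include <stdint.h>

    #define LANES 4          /* 4 for the SSE managers, 16 for AVX-512 */

    /* args_digest is stored word-major: all lanes' word 0, then all lanes'
     * word 1, and so on.  This mirrors the 4*idx + w*4*LANES byte offsets
     * used by the movd/pinsrd gathers (sketch, not the ISA-L API). */
    static void gather_digest(const uint32_t args_digest[8 * LANES],
                              unsigned int lane, uint32_t out[8])
    {
            for (unsigned int w = 0; w < 8; w++)
                    out[w] = args_digest[lane + w * LANES];
    }

    static void scatter_digest(uint32_t args_digest[8 * LANES],
                               unsigned int lane, const uint32_t in[8])
    {
            for (unsigned int w = 0; w < 8; w++)
                    args_digest[lane + w * LANES] = in[w];
    }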
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..903fb733b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..b875735f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
new file mode 100644
index 000000000..cf22c4aee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
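The three init routines differ only in how many lanes they seed into unused_lanes, which is a packed stack of 4-bit lane indices: 0xF3210 for the 4-lane SSE manager, 0xF76543210 for the 8-lane AVX2 manager, and 0xfedcba9876543210 for the 16-lane AVX-512 manager. Submit pops the low nibble and shifts right; completion shifts left and ORs the freed index back in. A short C sketch of those two operations (hypothetical helper names):

    #include <stdint.h>
    #include <stdio.h>

    /* Pop the lowest nibble: the lane index to use next.
     * (Matches "movzx lane, BYTE(unused_lanes); and lane, 0xF;
     *  shr unused_lanes, 4" in the submit paths.) */
    static unsigned int pop_lane(uint64_t *unused_lanes)
    {
            unsigned int lane = (unsigned int)(*unused_lanes & 0xF);
            *unused_lanes >>= 4;
            return lane;
    }

    /* Push a freed lane back: shift left and OR in the index.
     * (Matches "shl unused_lanes, 4; or unused_lanes, idx".) */
    static void push_lane(uint64_t *unused_lanes, unsigned int lane)
    {
            *unused_lanes = (*unused_lanes << 4) | (lane & 0xF);
    }

    int main(void)
    {
            uint64_t lanes = 0xF3210;            /* SSE: lanes 0..3 free */
            unsigned int l0 = pop_lane(&lanes);  /* -> 0, lanes = 0xF321 */
            push_lane(&lanes, l0);               /* back to 0xF3210      */
            printf("lane %u, pool %llx\n", l0, (unsigned long long)lanes);
            return 0;
    }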
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..cb7d5790a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
@@ -0,0 +1,260 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx, function
+sha256_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
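The submit path above is the buffering half of the scheme: pop a free lane from unused_lanes, tag the job's length with that lane index, copy its digest and buffer pointer into the lane, and return NULL unless every lane is now occupied; only a full batch justifies running the multi-buffer kernel. A condensed C sketch of that decision, using simplified stand-in types rather than the ISA-L structures:

    #include <stdint.h>

    #define LANES 4      /* sha256_mb_x4_avx processes 4 lanes at a time */

    /* Illustrative stand-ins; not the ISA-L job or manager layout. */
    struct job_sketch { uint32_t len_blocks; };
    struct mgr_sketch {
            uint64_t unused_lanes;
            uint32_t num_lanes_inuse;
            uint32_t lens[LANES];
            struct job_sketch *job_in_lane[LANES];
    };

    /* Sketch of the submit-side buffering: pop a lane, record the job,
     * and decide whether the batch is full enough to run the kernel. */
    static int submit_sketch(struct mgr_sketch *m, struct job_sketch *job)
    {
            unsigned int lane = (unsigned int)(m->unused_lanes & 0xF);
            m->unused_lanes >>= 4;
            m->job_in_lane[lane] = job;
            m->lens[lane] = (job->len_blocks << 4) | lane;  /* packed length */
            m->num_lanes_inuse++;

            /* The real routine returns NULL here unless all lanes are busy;
             * only a full batch is worth the multi-buffer kernel. */
            return m->num_lanes_inuse == LANES;  /* 1: run sha256_mb_x4_avx now */
    }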
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..af2fc89ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx2, function
+sha256_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
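After the minimum is found, the manager subtracts the number of blocks about to be processed from every lane's packed length. The vpand with clear_low_nibble strips the lane-index nibble from the minimum before it is broadcast and subtracted, so each lane keeps its own index while only its block count shrinks. A scalar C sketch of that update, assuming the (blocks << 4) | lane packing these managers use:

    #include <stdint.h>

    #define LANES 8   /* the AVX2 manager tracks 8 lanes */

    /* Scalar equivalent of "vpand with clear_low_nibble; vpshufd broadcast;
     * vpsubd": subtract the processed block count from every lane while
     * leaving each lane-index nibble untouched (sketch only). */
    static void advance_lens(uint32_t lens[LANES], uint32_t min_packed)
    {
            uint32_t blocks_done = min_packed & ~0xFu;  /* clear the lane nibble */
            for (unsigned int i = 0; i < LANES; i++)
                    lens[i] -= blocks_done;
    }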
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..cdc477370
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx512, function
+sha256_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lane lengths, ymm1 the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_submit_avx512
+no_sha256_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..b1bbc7002
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_sse, function
+sha256_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
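+	; As in the other managers, _args_digest is transposed: word w of lane n
+	; is stored at offset 4*n + w*16, keeping each digest word contiguous
+	; across the four SSE lanes.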
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
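+	; idx keeps the lane nibble of the shortest job, len2 its length with the
+	; nibble masked off; a zero len2 means that job needs no more hashing and
+	; can be completed immediately.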
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..cb1dce641
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,301 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*7
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse_ni(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_sse_ni, function
+sha256_mb_mgr_submit_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+
+ cmp unused_lanes, 0xF32 ; we will process two jobs at the same time
+ jne return_null ; wait for another sha_ni job
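+	; The SSE manager init leaves unused_lanes as the nibble list 0xF3210;
+	; after lanes 0 and 1 have been handed out it reads 0xF32, i.e. exactly
+	; two jobs are queued for the two-lane SHA-NI path below.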
+
+	; Compare with the SHA-NI single-buffer threshold; if num_lanes_sse <= threshold, use the SHA-NI path
+ %if SHA256_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb
+ ; shani glue code
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+	; subtract the min length (len2, lane nibble cleared) from each lane's length
+ sub lens0, len2
+ sub lens1, len2
+
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov r10, idx
+	or	r10, 0x1000	; r10 low byte = idx, second byte = 0x10 (16), as sha256_ni_x2 expects
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x2
+ ; state and idx are intact
+ %else
+ ; original mb code
+ cmp unused_lanes, 0xF
+ jne return_null
+
+ start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+ %endif
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_submit_sse_ni
+ no_sha256_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
new file mode 100644
index 000000000..768bfca78
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
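+	// Flushing repeatedly forces partially filled lanes to be hashed and
+	// returns each completed context until the manager is empty (NULL).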
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA256(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
new file mode 100644
index 000000000..adba77f3d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha256_mb test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
new file mode 100644
index 000000000..9535d80df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA256_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA256_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha256 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Run sha256_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA256_BLOCK_SIZE +
+ SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
new file mode 100644
index 000000000..8a5b5a9b2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha256_mb.h"
+
+typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
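+/* Deterministically maps a job index onto one of the MSGS test messages so
+ * the larger test below submits the fixed vectors in a scrambled but
+ * reproducible order. */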
+
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static uint8_t msg7[] = "";
+
+static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039,
+ 0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1
+};
+
+static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6,
+ 0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD
+};
+
+static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619,
+ 0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3
+};
+
+static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407,
+ 0x8EC752A6, 0x084EC1F3, 0xBD873D79, 0x3FF78383
+};
+
+static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD,
+ 0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA
+};
+
+static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F,
+ 0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE
+};
+
+static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924,
+ 0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha256 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..51759d7a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 4000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 20
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..235ec74a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA256_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sha256_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
new file mode 100644
index 000000000..f45669c6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
@@ -0,0 +1,930 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 16 SHA256 digests in parallel using AVX512
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL var2
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(E, F, G) = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
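These are the standard SHA-256 round functions. For reference, a scalar C sketch of the same operations (helper names are illustrative, not part of this file):

	#include <stdint.h>

	static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

	static inline uint32_t CH(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
	static inline uint32_t MAJ(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }
	static inline uint32_t SIGMA0(uint32_t a) { return ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); }
	static inline uint32_t SIGMA1(uint32_t e) { return ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); }
	static inline uint32_t sigma0(uint32_t w) { return ror32(w, 7) ^ ror32(w, 18) ^ (w >> 3); }
	static inline uint32_t sigma1(uint32_t w) { return ror32(w, 17) ^ ror32(w, 19) ^ (w >> 10); }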
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
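PROCESS_LOOP folds CH, MAJ and the three-way XORs into single vpternlogd instructions; the 8-bit immediate is the truth table over the three packed inputs, so 0xCA selects CH (F if E else G), 0xE8 selects MAJ (at least two of three set) and 0x96 is the XOR of all three. A small hedged check of those immediates in scalar C (ternlog_bitwise() is an illustrative model of the instruction, not a library call):

	#include <assert.h>
	#include <stdint.h>

	/* Bit idx of imm is the output for input bits (a,b,c), where
	 * idx = (a_bit << 2) | (b_bit << 1) | c_bit. */
	static uint32_t ternlog_bitwise(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
	{
		uint32_t r = 0;
		for (int bit = 0; bit < 32; bit++) {
			int idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
			r |= (uint32_t)((imm >> idx) & 1) << bit;
		}
		return r;
	}

	static void check_ternlog(uint32_t e, uint32_t f, uint32_t g)
	{
		assert(ternlog_bitwise(e, f, g, 0xCA) == ((e & f) ^ (~e & g)));           /* CH   */
		assert(ternlog_bitwise(e, f, g, 0xE8) == ((e & f) ^ (e & g) ^ (f & g)));  /* MAJ  */
		assert(ternlog_bitwise(e, f, g, 0x96) == (e ^ f ^ g));                    /* XOR3 */
	}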
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord only executes on port 8
+;
+; Main processing loop per round
+; Computes message schedule word Wt+16 from the current, now-unneeded word
+%macro PROCESS_LOOP_00_47 5
+%define %%WT %1
+%define %%ROUND %2
+%define %%WTp1 %3
+%define %%WTp9 %4
+%define %%WTp14 %5
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ ;; For next value in msg schedule
+ ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt
+
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, H, %%WT ; T1 = H + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP6 ; T1 = T1 + Kt
+ vprord H, A, 2 ; ROR_2(A)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vmovdqa32 TMP0, A
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpaddd D, D, T1 ; D = D + T1
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+
+ vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+%endmacro
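MSG_SCHED_ROUND_16_63 is the standard SHA-256 message-schedule recurrence evaluated for 16 lanes at once; per lane the scalar update is simply (a sketch, ror32() as in the earlier helper):

	/* Schedule word for round t >= 16, one lane. */
	static uint32_t next_w(const uint32_t w[64], int t)
	{
		uint32_t s0 = ror32(w[t - 15], 7) ^ ror32(w[t - 15], 18) ^ (w[t - 15] >> 3);
		uint32_t s1 = ror32(w[t - 2], 17) ^ ror32(w[t - 2], 19) ^ (w[t - 2] >> 10);
		return w[t - 16] + s0 + w[t - 7] + s1;
	}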
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to args structure (SHA256_MB_ARGS_X16)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha256_mb_x16_avx512)
+sha256_mb_x16_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+	; Do we need to transpose the digests?
+	; SHA1 does not, but SHA256 has been doing so
+
+ xor IDX, IDX
+
+ ;; Read in first block of input data
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ ;; update into data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+	; Do we need to untranspose the digests?
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+ section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_x16_avx512
+no_sha256_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
new file mode 100644
index 000000000..7f8f8829b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
@@ -0,0 +1,431 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using AVX
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
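The shufps sequence above converts four lane-major 16-byte loads into word-major form, so each output register holds the same message-word index from all four lanes. Conceptually it is just a 4x4 transpose (a model of the data movement only, not of the register-level sequence):

	#include <stdint.h>

	/* in[lane][word] -> out[word][lane], 4 lanes x 4 dwords. */
	static void transpose4x4(uint32_t out[4][4], const uint32_t in[4][4])
	{
		for (int lane = 0; lane < 4; lane++)
			for (int word = 0; word < 4; word++)
				out[word][lane] = in[lane][word];
	}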
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
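AVX has no packed 32-bit rotate instruction, so PRORD builds a right rotate out of two shifts and an OR (the AVX-512 x16 path above uses vprord directly). The scalar equivalent, as a sketch:

	/* Rotate right by imm (0 < imm < 32) -- what the PRORD macro computes per dword. */
	static inline uint32_t prord32(uint32_t x, int imm)
	{
		return (x >> imm) | (x << (32 - imm));
	}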
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used)
+;; arg 2 : arg2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+mk_global sha256_mb_x4_avx, function, internal
+align 32
+sha256_mb_x4_avx:
+ endbranch
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ vmovdqa a,[arg1+0*SZ4]
+ vmovdqa b,[arg1+1*SZ4]
+ vmovdqa c,[arg1+2*SZ4]
+ vmovdqa d,[arg1+3*SZ4]
+ vmovdqa e,[arg1+4*SZ4]
+ vmovdqa f,[arg1+5*SZ4]
+ vmovdqa g,[arg1+6*SZ4]
+ vmovdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPS TT2,[inp0+IDX+i*16]
+ VMOVPS TT1,[inp1+IDX+i*16]
+ VMOVPS TT4,[inp2+IDX+i*16]
+ VMOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ vmovdqa [arg1+0*SZ4],a
+ vmovdqa [arg1+1*SZ4],b
+ vmovdqa [arg1+2*SZ4],c
+ vmovdqa [arg1+3*SZ4],d
+ vmovdqa [arg1+4*SZ4],e
+ vmovdqa [arg1+5*SZ4],f
+ vmovdqa [arg1+6*SZ4],g
+ vmovdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
new file mode 100644
index 000000000..2d349abbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
@@ -0,0 +1,426 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using SSE
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
+
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ movdqa a0, e ; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ paddd a1, [SZ4*((%%i-7)&0xf) + rsp]
+ paddd %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : pointer args (only 4 of the 8 lanes used)
+;; arg 2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+
+mk_global sha256_mb_x4_sse, function, internal
+align 32
+sha256_mb_x4_sse:
+ endbranch
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ movdqa a,[arg1+0*SZ4]
+ movdqa b,[arg1+1*SZ4]
+ movdqa c,[arg1+2*SZ4]
+ movdqa d,[arg1+3*SZ4]
+ movdqa e,[arg1+4*SZ4]
+ movdqa f,[arg1+5*SZ4]
+ movdqa g,[arg1+6*SZ4]
+ movdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ4], a
+ movdqa [rsp + _DIGEST + 1*SZ4], b
+ movdqa [rsp + _DIGEST + 2*SZ4], c
+ movdqa [rsp + _DIGEST + 3*SZ4], d
+ movdqa [rsp + _DIGEST + 4*SZ4], e
+ movdqa [rsp + _DIGEST + 5*SZ4], f
+ movdqa [rsp + _DIGEST + 6*SZ4], g
+ movdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPS TT2,[inp0+IDX+i*16]
+ MOVPS TT1,[inp1+IDX+i*16]
+ MOVPS TT4,[inp2+IDX+i*16]
+ MOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ pshufb TT2, TMP
+ pshufb TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddd a, [rsp + _DIGEST + 0*SZ4]
+ paddd b, [rsp + _DIGEST + 1*SZ4]
+ paddd c, [rsp + _DIGEST + 2*SZ4]
+ paddd d, [rsp + _DIGEST + 3*SZ4]
+ paddd e, [rsp + _DIGEST + 4*SZ4]
+ paddd f, [rsp + _DIGEST + 5*SZ4]
+ paddd g, [rsp + _DIGEST + 6*SZ4]
+ paddd h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ movdqa [arg1+0*SZ4],a
+ movdqa [arg1+1*SZ4],b
+ movdqa [arg1+2*SZ4],c
+ movdqa [arg1+3*SZ4],d
+ movdqa [arg1+4*SZ4],e
+ movdqa [arg1+5*SZ4],f
+ movdqa [arg1+6*SZ4],g
+ movdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
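
For reference, each of the four SSE lanes in the kernel above applies the standard SHA-256 round and message-schedule updates to its own block; the xmm registers simply hold one 32-bit lane per buffer. A minimal scalar sketch of those per-lane updates (FIPS 180-4 definitions, not part of the library build):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round: s[0..7] = {a,b,c,d,e,f,g,h}, w = schedule word, k = round constant */
static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t sigma1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch     = (e & f) ^ (~e & g);
    uint32_t t1     = h + sigma1 + ch + k + w;
    uint32_t sigma0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj    = (a & b) ^ (a & c) ^ (b & c);
    uint32_t t2     = sigma0 + maj;

    /* rotate the working variables, as the ROTATE_ARGS macro does with %xdefine */
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

/* Schedule word for rounds 16..63, mirroring what ROUND_16_XX computes per lane */
static uint32_t sha256_schedule(const uint32_t w[64], int t)
{
    uint32_t s0 = rotr32(w[t - 15], 7) ^ rotr32(w[t - 15], 18) ^ (w[t - 15] >> 3);
    uint32_t s1 = rotr32(w[t - 2], 17) ^ rotr32(w[t - 2], 19) ^ (w[t - 2] >> 10);
    return w[t - 16] + s0 + w[t - 7] + s1;
}
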
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
new file mode 100644
index 000000000..dbd9db1b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
@@ -0,0 +1,620 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute eight (oct) SHA256 digests in parallel using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+; ymm0 a
+; ymm1 b
+; ymm2 c
+; ymm3 d
+; ymm4 e
+; ymm5 f
+; ymm6 g TMP0
+; ymm7 h TMP1
+; ymm8 T1 TT0
+; ymm9 TT1
+; ymm10 TT2
+; ymm11 TT3
+; ymm12 a0 TT4
+; ymm13 a1 TT5
+; ymm14 a2 TT6
+; ymm15 TMP TT7
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define T1 ymm8
+
+%define a0 ymm12
+%define a1 ymm13
+%define a2 ymm14
+%define TMP ymm15
+
+%define TMP0 ymm6
+%define TMP1 ymm7
+
+%define TT0 ymm8
+%define TT1 ymm9
+%define TT2 ymm10
+%define TT3 ymm11
+%define TT4 ymm12
+%define TT5 ymm13
+%define TT6 ymm14
+%define TT7 ymm15
+
+%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ8
+%define PTR_SZ 8
+%define SHA256_DIGEST_WORD_SIZE 4
+%define MAX_SHA256_LANES 8
+%define NUM_SHA256_DIGEST_WORDS 8
+%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; rsp is saved and explicitly aligned down to 32 bytes in the prologue,
+;; so stack alignment does not depend on the exact FRAMESZ value
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .ytmp resb 4*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _YTMP stack_frame.ytmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _YTMP + 0*SZ8
+%define YTMP1 rsp + _YTMP + 1*SZ8
+%define YTMP2 rsp + _YTMP + 2*SZ8
+%define YTMP3 rsp + _YTMP + 3*SZ8
+
+%define VMOVPS vmovups
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer to args structure (digests and input data pointers)
+;; arg 2 : INP_SIZE : size of input in blocks
+mk_global sha256_mb_x8_avx2, function, internal
+align 16
+sha256_mb_x8_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+	; save rsp, then allocate a 32-byte-aligned frame for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[K256_8_MB]
+
+	;; load the address of each of the 8 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ8], a
+ vmovdqa [rsp + _DIGEST + 1*SZ8], b
+ vmovdqa [rsp + _DIGEST + 2*SZ8], c
+ vmovdqa [rsp + _DIGEST + 3*SZ8], d
+ vmovdqa [rsp + _DIGEST + 4*SZ8], e
+ vmovdqa [rsp + _DIGEST + 5*SZ8], f
+ vmovdqa [rsp + _DIGEST + 6*SZ8], g
+ vmovdqa [rsp + _DIGEST + 7*SZ8], h
+%assign i 0
+%rep 2
+ VMOVPS TT0,[inp0+IDX+i*32]
+ VMOVPS TT1,[inp1+IDX+i*32]
+ VMOVPS TT2,[inp2+IDX+i*32]
+ VMOVPS TT3,[inp3+IDX+i*32]
+ VMOVPS TT4,[inp4+IDX+i*32]
+ VMOVPS TT5,[inp5+IDX+i*32]
+ VMOVPS TT6,[inp6+IDX+i*32]
+ VMOVPS TT7,[inp7+IDX+i*32]
+ vmovdqa [YTMP0], g
+ vmovdqa [YTMP1], h
+ TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK]
+ vmovdqa g, [YTMP0]
+ vpshufb TT0, TT0, TMP1
+ vpshufb TT1, TT1, TMP1
+ vpshufb TT2, TT2, TMP1
+ vpshufb TT3, TT3, TMP1
+ vpshufb TT4, TT4, TMP1
+ vpshufb TT5, TT5, TMP1
+ vpshufb TT6, TT6, TMP1
+ vpshufb TT7, TT7, TMP1
+ vmovdqa h, [YTMP1]
+ vmovdqa [YTMP0], TT4
+ vmovdqa [YTMP1], TT5
+ vmovdqa [YTMP2], TT6
+ vmovdqa [YTMP3], TT7
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqa TT0, [YTMP0]
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqa TT1, [YTMP1]
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqa TT2, [YTMP2]
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqa TT3, [YTMP3]
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+%assign i (i*8)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+K256_8_MB:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
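
The TRANSPOSE8 macro above reshapes the input so that each ymm register holds the same dword index from all eight buffers; the round code can then advance eight blocks with one vector instruction per step. A plain-C sketch of the equivalent data movement (illustrative only):

#include <stdint.h>

/*
 * in[lane][i]  : dword i of lane 'lane', as loaded through inp0..inp7
 * out[i][lane] : one row per dword index; each row corresponds to one ymm register
 */
static void transpose_8x8_dwords(const uint32_t in[8][8], uint32_t out[8][8])
{
    for (int lane = 0; lane < 8; lane++)
        for (int i = 0; i < 8; i++)
            out[i][lane] = in[lane][i];
}
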
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
new file mode 100644
index 000000000..af54f7cc3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
@@ -0,0 +1,125 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha256_ctx_mgr_init_sse
+extern sha256_ctx_mgr_submit_sse
+extern sha256_ctx_mgr_flush_sse
+
+extern sha256_ctx_mgr_init_avx
+extern sha256_ctx_mgr_submit_avx
+extern sha256_ctx_mgr_flush_avx
+
+extern sha256_ctx_mgr_init_avx2
+extern sha256_ctx_mgr_submit_avx2
+extern sha256_ctx_mgr_flush_avx2
+
+extern sha256_ctx_mgr_init_base
+extern sha256_ctx_mgr_submit_base
+extern sha256_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha256_ctx_mgr_init_avx512
+ extern sha256_ctx_mgr_submit_avx512
+ extern sha256_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI
+ extern sha256_ctx_mgr_init_sse_ni
+ extern sha256_ctx_mgr_submit_sse_ni
+ extern sha256_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+ extern sha256_ctx_mgr_init_avx512_ni
+ extern sha256_ctx_mgr_submit_avx512_ni
+ extern sha256_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit is the initial value of *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse the mbin_dispatch_init6 extension, replacing the base version with the SSE version
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+ sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+ sha256_ctx_mgr_init_avx512, sha256_ctx_mgr_init_sse_ni, sha256_ctx_mgr_init_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+ sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+ sha256_ctx_mgr_submit_avx512, sha256_ctx_mgr_submit_sse_ni, sha256_ctx_mgr_submit_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+ sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+ sha256_ctx_mgr_flush_avx512, sha256_ctx_mgr_flush_sse_ni, sha256_ctx_mgr_flush_avx512_ni
+ %else
+ mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+ sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+ sha256_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+ sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+ sha256_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+ sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+ sha256_ctx_mgr_flush_avx512
+ %endif
+%else
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+ sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+ sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+ sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_sse_ni
+ %else
+ mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+ sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2
+ mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+ sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+ sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2
+ %endif
+%endif
+
+;;; func core, ver, snum
+slversion sha256_ctx_mgr_init, 00, 04, 0160
+slversion sha256_ctx_mgr_submit, 00, 04, 0161
+slversion sha256_ctx_mgr_flush, 00, 04, 0162
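
The mbin_interface / mbin_dispatch_* macros used above implement a resolve-on-first-call scheme: each exported symbol initially points at a dispatcher that probes the CPU once, rebinds an internal pointer to the best available implementation, and then jumps to it, so later calls are direct. A hedged C sketch of the idea (cpu_has_avx2 and the function names are illustrative, not the ISA-L API):

#include <stddef.h>

typedef void (*flush_fn)(void *mgr);

static void flush_sse(void *mgr)  { (void)mgr; /* SSE implementation  */ }
static void flush_avx2(void *mgr) { (void)mgr; /* AVX2 implementation */ }

/* illustrative feature probe; a real build would use CPUID */
static int cpu_has_avx2(void) { return 0; }

static void flush_resolve(void *mgr);       /* forward declaration    */
static flush_fn flush_ptr = flush_resolve;  /* starts at the resolver */

static void flush_resolve(void *mgr)
{
    flush_ptr = cpu_has_avx2() ? flush_avx2 : flush_sse;  /* bind once */
    flush_ptr(mgr);
}

/* public entry point: after the first call this goes straight to the chosen version */
void sha256_ctx_mgr_flush_example(void *mgr)
{
    flush_ptr(mgr);
}
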
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
new file mode 100644
index 000000000..25fc9ce16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
@@ -0,0 +1,361 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define SHUF_MASK xmm8
+
+%define ABEF_SAVE xmm9
+%define CDGH_SAVE xmm10
+
+; arg indexing starts from 0 here, while mgr_flush/submit counts from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define TMP r9 ; local variable -- assistant to address digest
+%define TBL rax
+;%define TMP2 r8 ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash on
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (used to locate the digest)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r11, xmm0-xmm10
+;
+mk_global sha256_ni_x1, function, internal
+sha256_ni_x1:
+ endbranch
+	shl	NBLK, 6		; convert block count into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*IDX]
+ ;; Initialize digest
+ ;; digests -> ABEF(state0), CDGH(state1)
+ pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
+ pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
+ pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
+ pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
+ pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4
+ pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
+ pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+ lea TBL, [TABLE]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; /* Save hash values for addition after rounds */
+ movdqa ABEF_SAVE, STATE0
+ movdqa CDGH_SAVE, STATE1
+
+ ; /* Rounds 0-3 */
+ movdqu MSG, [DPTR + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 4-7 */
+ movdqu MSG, [DPTR + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 8-11 */
+ movdqu MSG, [DPTR + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 12-15 */
+ movdqu MSG, [DPTR + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 16-19 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 20-23 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 24-27 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 28-31 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 32-35 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 36-39 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 40-43 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 44-47 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 48-51 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 52-55 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 56-59 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 60-63 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Add current hash values with previously saved */
+ paddd STATE0, ABEF_SAVE
+ paddd STATE1, CDGH_SAVE
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*IDX]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
+ pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
+ pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
+ pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
+ pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4
+ pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
+ pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
+
+ ; update input pointers
+ mov [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ret
+
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x1
+no_sha256_ni_x1:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
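
The pinsrd/pextrd sequences above pack the eight digest words into the ABEF/CDGH register layout expected by sha256rnds2, with the alphabetically earlier word in the higher dword; the digest write-out performs the inverse. A small C sketch of that mapping (element 3 being the most significant dword of an xmm register):

#include <stdint.h>

/* state0 elements 3..0 = {A, B, E, F}; state1 elements 3..0 = {C, D, G, H} */
static void sha256_pack_abef_cdgh(const uint32_t h[8],
                                  uint32_t state0[4], uint32_t state1[4])
{
    state0[3] = h[0];  /* A */
    state0[2] = h[1];  /* B */
    state0[1] = h[4];  /* E */
    state0[0] = h[5];  /* F */

    state1[3] = h[2];  /* C */
    state1[2] = h[3];  /* D */
    state1[1] = h[6];  /* G */
    state1[0] = h[7];  /* H */
}
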
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
new file mode 100644
index 000000000..74cfc93b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
@@ -0,0 +1,574 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; rsp is saved and explicitly re-aligned to 16 bytes in the prologue
+%define FRAMESZ 64 ; space to save STATE0/STATE1 for both lanes (4 x 16 bytes)
+%define RSPSAVE rax
+
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define STATE0b xmm8
+%define STATE1b xmm9
+%define MSGTMP0b xmm10
+%define MSGTMP1b xmm11
+%define MSGTMP2b xmm12
+%define MSGTMP3b xmm13
+%define MSGTMP4b xmm14
+
+%define SHUF_MASK xmm15
+
+; arg indexing starts from 0 here, while mgr_flush/submit counts from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define DPTRb r12
+%define TMP r9 ; local variable -- assistant to address digest
+%define TBL r13
+%define TMPb r14 ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash on
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (used to locate the digest)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r14, xmm0-xmm15
+;
+mk_global sha256_ni_x2, function, internal
+sha256_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+	and	rsp, ~0xF	; align rsp down to a 16-byte boundary
+
+	shl	NBLK, 6		; convert block count into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*0]
+ lea TMPb, [MGR + 4*1]
+
+ ;; Initialize digest
+ ;; digests -> ABEF(state0), CDGH(state1)
+ pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
+ pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
+ pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
+ pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
+ pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4
+ pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
+ pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
+
+ pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A
+ pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B
+ pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D
+ pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E
+ pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G
+ lea TMPb, [TMPb + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4
+ pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F
+ pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+ lea TBL, [TABLE]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + 8*0]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; /* Save hash values for addition after rounds */
+ movdqa [rsp + 0*16], STATE0
+ movdqa [rsp + 1*16], STATE1
+
+ movdqa [rsp + 2*16], STATE0b
+ movdqa [rsp + 3*16], STATE1b
+
+ ; /* Rounds 0-3 */
+ movdqu MSG, [DPTR + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqu MSG, [DPTRb + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0b, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 4-7 */
+ movdqu MSG, [DPTR + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqu MSG, [DPTRb + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1b, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 8-11 */
+ movdqu MSG, [DPTR + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqu MSG, [DPTRb + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2b, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 12-15 */
+ movdqu MSG, [DPTR + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqu MSG, [DPTRb + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3b, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 16-19 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 20-23 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 24-27 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 28-31 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 32-35 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 36-39 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 40-43 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 44-47 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 48-51 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 52-55 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 56-59 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 60-63 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Add current hash values with previously saved */
+ paddd STATE0, [rsp + 0*16]
+ paddd STATE1, [rsp + 1*16]
+
+ paddd STATE0b, [rsp + 2*16]
+ paddd STATE1b, [rsp + 3*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*0]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
+ pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
+ pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
+ pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
+ pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4
+ pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
+ pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
+
+ lea TMPb, [MGR + 4*1]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A
+ pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B
+ pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D
+ pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E
+ pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G
+ lea TMPb, [TMPb + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4
+ pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F
+ pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H
+
+ ; update input pointers
+ mov [MGR + _data_ptr + 0*8], DPTR
+ mov [MGR + _data_ptr + 1*8], DPTRb
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x2
+no_sha256_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
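
sha256_ni_x2 duplicates every step for a second lane so that two independent sha256rnds2 dependency chains are in flight at once, hiding instruction latency. A scalar sketch of that structure (compress_block is a placeholder stub here, not library code):

#include <stdint.h>
#include <stddef.h>

typedef struct { uint32_t h[8]; } sha256_lane;

/* placeholder: a real implementation runs the 64 SHA-256 rounds on one 64-byte block */
static void compress_block(sha256_lane *st, const uint8_t block[64])
{
    (void)st; (void)block;
}

static void sha256_two_lanes(sha256_lane *a, const uint8_t *data_a,
                             sha256_lane *b, const uint8_t *data_b,
                             size_t nblocks)
{
    for (size_t i = 0; i < nblocks; i++) {
        /* the two updates are independent, so an interleaved kernel can issue
         * instructions for lane b while lane a's round chain is still stalled */
        compress_block(a, data_a + 64 * i);
        compress_block(b, data_b + 64 * i);
    }
}
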
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
new file mode 100644
index 000000000..fc13ec279
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
@@ -0,0 +1,567 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+;
+; Copyright (C) 2013 Intel Corporation.
+;
+; Authors:
+; James Guilford <james.guilford@intel.com>
+; Kirk Yap <kirk.s.yap@intel.com>
+; Tim Chen <tim.c.chen@linux.intel.com>
+; Transcoded by:
+; Xiaodong Liu <xiaodong.liu@intel.com>
+;
+; This software is available to you under the OpenIB.org BSD license
+; below:
+;
+; Redistribution and use in source and binary forms, with or
+; without modification, are permitted provided that the following
+; conditions are met:
+;
+; - Redistributions of source code must retain the above
+; copyright notice, this list of conditions and the following
+; disclaimer.
+;
+; - Redistributions in binary form must reproduce the above
+; copyright notice, this list of conditions and the following
+; disclaimer in the documentation and/or other materials
+; provided with the distribution.
+;
+; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+; SOFTWARE.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+%xdefine X0 xmm4
+%xdefine X1 xmm5
+%xdefine X2 xmm6
+%xdefine X3 xmm7
+
+%xdefine XTMP0 xmm0
+%xdefine XTMP1 xmm1
+%xdefine XTMP2 xmm2
+%xdefine XTMP3 xmm3
+%xdefine XTMP4 xmm8
+%xdefine XFER xmm9
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm12
+
+; arg index starts from 0, while mgr_flush/submit starts from 1
+%define MGR arg0 ; rdi or rcx
+%define NBLK arg1 ; rsi or rdx
+%define IDX r8 ; local variable -- consistent with caller
+%define NLANX4 r10 ; consistent with caller, should be r10
+
+%define TMGR r9 ; MGR pointer, saved on the stack as _TMGR
+%define INP r9 ; data pointer stored in stack named _INP
+%define SRND r9 ; clobbers INP
+%define TMP r9 ; local variable -- assistant to address digest
+
+%xdefine TBL rbp
+%xdefine c ecx
+%xdefine d esi
+%xdefine e edx
+%xdefine a eax
+%xdefine b ebx
+
+%xdefine f edi
+%xdefine g r12d
+%xdefine h r11d
+
+%xdefine y0 r13d
+%xdefine y1 r14d
+%xdefine y2 r15d
+
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define _STACK_ALIGN_SIZE 8 ; 0 or 8 depends on pushes
+%define _INP_END_SIZE 8
+%define _INP_SIZE 8
+%define _TMGR_SIZE 8
+%define _XFER_SIZE 16
+%define _XMM_SAVE_SIZE 0
+%define _GPR_SAVE_SIZE 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+
+%define _STACK_ALIGN 0
+%define _INP_END (_STACK_ALIGN + _STACK_ALIGN_SIZE)
+%define _INP (_INP_END + _INP_END_SIZE)
+%define _TMGR (_INP + _INP_SIZE)
+%define _XFER (_TMGR + _TMGR_SIZE)
+%define _XMM_SAVE (_XFER + _XFER_SIZE)
+%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE)
+%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE)
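+; (sanity check: 8+8+8+8+16+0+8*9 = 120 bytes = 15*8, an odd multiple of 8;
+;  frame (120) + return address (8) = 128, so rsp stays 16-byte aligned for
+;  the movdqa stores at _XFER)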
+
+;; assume buffers not aligned
+%define MOVDQ movdqu
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm reg, [mem]
+; Add reg into mem using a reg-to-mem add, then reload the sum into reg
+%macro addm 2
+ add %2, %1 ;changed
+ mov %1, %2 ;changed
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ MOVDQ %1, %2 ;changed
+ pshufb %1, %3 ;changed
+%endmacro
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endmacro
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endmacro
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ movdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13))
+ xor y2, g ; y2 = f^g
+ paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER] ; y2 = k + w + S1 + CH
+ movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pslld XTMP1, (32-7) ;
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ psrld XTMP2, 7 ;
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ pslld XTMP3, (32-18) ;
+ xor y1, a ; y1 = a ^ (a >> (22-13))
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ psrld XTMP2, 18 ;
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP1, XTMP3
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pxor XTMP1, XTMP4 ; XTMP1 = s0
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ pshufd XTMP2, X3, 11111010B ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13))
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH
+ pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ pshufd XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {DDCC}
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, a ; y1 = a ^ (a >> (22-13))
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ pxor XTMP2, XTMP3 ;
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH
+ pxor X0, XTMP2 ; X0 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+%endmacro
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13))
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ %xdefine offset (%1 * 4 + _XFER)
+ add y2, [rsp + offset] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha256_opt_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash on
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (stride between a lane's digest words)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs, xmm0-xmm12
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
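+; Note: IDX and NLANX4 have no formal C parameter; as the unpacking code below
+; shows, the caller passes both packed in r10 (IDX in the low byte, NLANX4 in the bits above).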
+section .text
+mk_global sha256_opt_x1, function, internal
+sha256_opt_x1:
+ endbranch
+ sub rsp, STACK_SIZE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+ shl NBLK, 6 ; convert to bytes
+ jz done_hash
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ mov [rsp + _TMGR], MGR
+ ;; Load input pointers
+ mov INP, [MGR + _data_ptr + IDX*8]
+ mov [rsp + _INP], INP
+ ;; nblk is used to indicate data end
+ add NBLK, INP
+ mov [rsp + _INP_END], NBLK ; pointer to end of data
+
+
+ mov TMGR, [rsp + _TMGR]
+ ;; load initial digest
+ lea TMP, [TMGR + 4*IDX]
+ mov a, [TMP + 0*NLANX4]
+ mov b, [TMP + 1*NLANX4]
+ mov c, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov d, [TMP + 1*NLANX4]
+ mov e, [TMP + 2*NLANX4]
+ mov g, [TMP + 4*NLANX4]
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4
+ mov f, [TMP + 2*NLANX4]
+ mov h, [TMP + 4*NLANX4]
+
+ movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+ movdqa SHUF_00BA, [_SHUF_00BA]
+ movdqa SHUF_DC00, [_SHUF_DC00]
+
+ mov INP, [rsp + _INP]
+loop0:
+ lea TBL, [K256]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ mov [rsp + _INP], INP
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
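+ ;; (3 passes x 4 macro calls x 4 rounds = rounds 0..47 with message scheduling;
+ ;;  loop2 below then runs rounds 48..63 on the final W values left in X0..X3)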
+ mov SRND, 3
+
+loop1:
+ movdqa XFER, [TBL]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 1*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 2*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 3*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+loop2:
+ paddd X0, [TBL]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X0, X2
+ movdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ ; write out digests
+ mov TMGR, [rsp + _TMGR]
+ lea TMP, [TMGR + 4*IDX]
+ addm a, [TMP + 0*NLANX4]
+ addm b, [TMP + 1*NLANX4]
+ addm c, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ addm d, [TMP + 1*NLANX4]
+ addm e, [TMP + 2*NLANX4]
+ addm g, [TMP + 4*NLANX4]
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4
+ addm f, [TMP + 2*NLANX4]
+ addm h, [TMP + 4*NLANX4]
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne loop0
+
+done_hash:
+ mov MGR, [rsp + _TMGR]
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data
+align 64
+K256:
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ DQ 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:
+ DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+
+; shuffle xDxC -> DC00
+_SHUF_DC00:
+ DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
new file mode 100644
index 000000000..c3515dc52
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
@@ -0,0 +1,204 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x6a09e667
+#define H1 0xbb67ae85
+#define H2 0x3c6ef372
+#define H3 0xa54ff53a
+#define H4 0x510e527f
+#define H5 0x9b05688c
+#define H6 0x1f83d9ab
+#define H7 0x5be0cd19
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
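+
+/* Note: relative to FIPS 180-4 the capitalization is swapped here: S0/S1 are the
+ * message-schedule sigma functions and s0/s1 the compression Sigma functions. */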
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+static void OPT_FIX sha256_single(const uint8_t * data, uint32_t digest[]);
+
+void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
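+ /* FIPS 180-4 padding: append 0x80, zero fill, then the 64-bit big-endian bit length */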
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE)
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+}
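+
+/* Illustrative usage sketch (not part of the library); assumes SHA256_DIGEST_NWORDS
+ * from sha256_mb.h is 8:
+ *
+ *   uint32_t digest[SHA256_DIGEST_NWORDS];
+ *   sha256_ref((const uint8_t *) "abc", digest, 3);
+ *   // digest[0] == 0xba7816bf for the well-known SHA-256("abc") test vector
+ */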
+
+void sha256_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}