49 files changed, 12711 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
new file mode 100644
index 000000000..9405c2469
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
@@ -0,0 +1,127 @@
+########################################################################
+#  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += 	sha256_mb/sha256_ctx_sse.c \
+		sha256_mb/sha256_ctx_avx.c \
+		sha256_mb/sha256_ctx_avx2.c \
+		sha256_mb/sha256_ctx_base.c
+# x86: job-manager initialisation sources
+lsrc_x86_64 += 	sha256_mb/sha256_mb_mgr_init_sse.c \
+		sha256_mb/sha256_mb_mgr_init_avx2.c
+
+# x86: assembly submit/flush managers, x4/x8 kernels and sha256_multibinary.asm
+lsrc_x86_64 += 	sha256_mb/sha256_mb_mgr_submit_sse.asm \
+		sha256_mb/sha256_mb_mgr_submit_avx.asm \
+		sha256_mb/sha256_mb_mgr_submit_avx2.asm \
+		sha256_mb/sha256_mb_mgr_flush_sse.asm \
+		sha256_mb/sha256_mb_mgr_flush_avx.asm \
+		sha256_mb/sha256_mb_mgr_flush_avx2.asm \
+		sha256_mb/sha256_mb_x4_sse.asm \
+		sha256_mb/sha256_mb_x4_avx.asm \
+		sha256_mb/sha256_mb_x8_avx2.asm \
+		sha256_mb/sha256_multibinary.asm
+# x86: AVX-512 context, manager and x16 kernel
+lsrc_x86_64 += 	sha256_mb/sha256_ctx_avx512.c \
+		sha256_mb/sha256_mb_mgr_init_avx512.c \
+		sha256_mb/sha256_mb_mgr_submit_avx512.asm \
+		sha256_mb/sha256_mb_mgr_flush_avx512.asm \
+		sha256_mb/sha256_mb_x16_avx512.asm
+
+lsrc_x86_64 += 	sha256_mb/sha256_opt_x1.asm
+# x86: sha256_ni_* kernels with their *_ni context and manager sources
+lsrc_x86_64 += 	sha256_mb/sha256_ni_x1.asm \
+		sha256_mb/sha256_ni_x2.asm \
+		sha256_mb/sha256_ctx_sse_ni.c \
+		sha256_mb/sha256_ctx_avx512_ni.c \
+		sha256_mb/sha256_mb_mgr_submit_sse_ni.asm \
+		sha256_mb/sha256_mb_mgr_flush_sse_ni.asm \
+		sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
+# The 32-bit x86 list reuses everything collected in lsrc_x86_64 above
+lsrc_x86_32 += 	$(lsrc_x86_64)
+# aarch64: base context code and the C reference implementation
+lsrc_aarch64 += sha256_mb/sha256_ctx_base.c \
+		sha256_mb/sha256_ref.c
+# aarch64: multibinary stub, dispatcher, CE context/job managers and the x1-x4 CE kernels
+lsrc_aarch64 += sha256_mb/aarch64/sha256_mb_multibinary.S \
+		sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c  \
+		sha256_mb/aarch64/sha256_ctx_ce.c	\
+		sha256_mb/aarch64/sha256_mb_mgr_ce.c	\
+		sha256_mb/aarch64/sha256_mb_x1_ce.S	\
+		sha256_mb/aarch64/sha256_mb_x2_ce.S	\
+		sha256_mb/aarch64/sha256_mb_x3_ce.S	\
+		sha256_mb/aarch64/sha256_mb_x4_ce.S
+
+
+lsrc_base_aliases += sha256_mb/sha256_ctx_base_aliases.c	\
+		sha256_mb/sha256_ctx_base.c	\
+		sha256_mb/sha256_ref.c
+
+src_include += -I $(srcdir)/sha256_mb
+
+extern_hdrs +=  include/sha256_mb.h \
+		include/multi_buffer.h
+
+other_src += 	include/datastruct.asm \
+		include/multibinary.asm \
+		sha256_mb/sha256_job.asm \
+		sha256_mb/sha256_mb_mgr_datastruct.asm \
+		include/reg_sizes.asm \
+		sha256_mb/sha256_ref.c \
+		include/memcpy_inline.h \
+		include/memcpy.asm \
+		include/intrinreg.h
+# Test programs, appended to the check/unit/perf test lists
+check_tests  +=	sha256_mb/sha256_mb_test  \
+		sha256_mb/sha256_mb_rand_test  \
+		sha256_mb/sha256_mb_rand_update_test \
+		sha256_mb/sha256_mb_flush_test
+
+unit_tests   += sha256_mb/sha256_mb_rand_ssl_test
+
+perf_tests  +=  sha256_mb/sha256_mb_vs_ossl_perf \
+		sha256_mb/sha256_mb_vs_ossl_shortage_perf
+
+sha256_mb_rand_ssl_test: sha256_ref.o
+sha256_mb_rand_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+# The update and flush tests are also linked against sha256_ref
+sha256_mb_rand_update_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_flush_test: sha256_ref.o
+sha256_mb_sha256_mb_flush_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+# The ssl test and both *_vs_ossl_* perf programs are linked with -lcrypto
+sha256_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
new file mode 100644
index 000000000..4776f55bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state);
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job);
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+// Initialise a context manager by initialising the CE job manager embedded in it (mgr->mgr).
+void sha256_ctx_mgr_init_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+	sha256_mb_mgr_init_ce(&mgr->mgr);
+}
+// Submit a buffer for ctx; returns ctx with ctx->error set on misuse, else the result of sha256_ctx_mgr_resubmit().
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_ce(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					  const void *buffer, uint32_t len,
+					  HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_fixedlen(&ctx->partial_block_buffer
+					[ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;	// job lengths are counted in blocks
+			// NOTE(review): the cast below assumes job is the first member of SHA256_HASH_CTX - confirm in sha256_mb.h
+			ctx =
+			    (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+		}
+	}
+	// ctx may now be NULL or a different context returned by the job manager; resubmit handles both.
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+	SHA256_HASH_CTX *ready = NULL;
+
+	// Drain the lane manager one job at a time until a context can be handed
+	// back to the caller or nothing is left in flight.
+	do {
+		SHA256_HASH_CTX *flushed =
+		    (SHA256_HASH_CTX *) sha256_mb_mgr_flush_ce(&mgr->mgr);
+
+		// NULL from the lane manager: no jobs remain, so report NULL as well.
+		if (flushed == NULL)
+			break;
+
+		// A flushed job may still have buffered input or padding to hash;
+		// resubmit queues that work and yields NULL until some context is
+		// ready to be returned.
+		ready = sha256_ctx_mgr_resubmit(mgr, flushed);
+	} while (ready == NULL);
+
+	return ready;
+}
+// Keep feeding ctx's pending work to the CE job manager; returns a context that needs no further processing now, or NULL.
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx)
+{
+	while (ctx) {
+		// COMPLETE is only set below, when the padded final block(s) are submitted.
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_fixedlen(ctx->partial_block_buffer,
+						((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr,
+										  &ctx->job);
+				continue;	// ctx is now whatever the job manager returned (possibly NULL)
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx =
+			    (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+			continue;
+		}
+		// Not the last update and no whole block left to hash: mark the context idle and return it.
+		if (ctx)
+			ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Seed the caller's digest words with SHA256_INITIAL_DIGEST.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+	uint64_t bit_len_be = to_be64((uint64_t) total_len << 3);	// message length in bits, big-endian
+	// Clear one block's worth after the message tail, then place the 0x80 terminator.
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	memset(&padblock[i - 16], 0, 8);
+#endif
+	// Byte copy instead of a store through a uint64_t cast: padblock is a byte array.
+	memcpy(&padblock[i - 8], &bit_len_be, sizeof(bit_len_be));
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+// Version records for the three CE entry points. NOTE(review): field meanings are not defined in this file - confirm.
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_ce_slver_02020142;	// suffix appears to spell core, ver, snum in hex
+struct slver sha256_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha256_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha256_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..8627991c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_submit)
+{
+	// Pick the CE provider only when AT_HWCAP advertises HWCAP_SHA2.
+	const unsigned long hwcap = getauxval(AT_HWCAP);
+
+	if (!(hwcap & HWCAP_SHA2))
+		return PROVIDER_BASIC(sha256_ctx_mgr_submit);
+	return PROVIDER_INFO(sha256_ctx_mgr_submit_ce);
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_init)
+{
+	// Pick the CE provider only when AT_HWCAP advertises HWCAP_SHA2.
+	const unsigned long hwcap = getauxval(AT_HWCAP);
+
+	if (!(hwcap & HWCAP_SHA2))
+		return PROVIDER_BASIC(sha256_ctx_mgr_init);
+	return PROVIDER_INFO(sha256_ctx_mgr_init_ce);
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_flush)
+{
+	// Pick the CE provider only when AT_HWCAP advertises HWCAP_SHA2.
+	const unsigned long hwcap = getauxval(AT_HWCAP);
+
+	if (!(hwcap & HWCAP_SHA2))
+		return PROVIDER_BASIC(sha256_ctx_mgr_flush);
+	return PROVIDER_INFO(sha256_ctx_mgr_flush_ce);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
new file mode 100644
index 000000000..aa63c4dd8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha256_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b)            (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b)            (((a) < (b)) ? (a) : (b))
+#endif
+// Number of lanes this manager schedules; it also selects which sha256_mb_ce_xN kernels are declared below.
+#define SHA256_MB_CE_MAX_LANES	3
+// Multi-lane kernels: one SHA256_JOB pointer per lane, then the number of blocks to hash on each lane.
+#if SHA256_MB_CE_MAX_LANES >=4
+void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+#endif
+void sha256_mb_ce_x1(SHA256_JOB *, int);
+// Lane state tests: lens[i] is (remaining blocks << 4) | lane index; job_in_lane is NULL for an empty lane.
+#define LANE_IS_NOT_FINISHED(state,i)  	\
+	(((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i)  	\
+	(((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define	LANE_IS_FREE(state,i)		\
+	(((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i)	\
+	(((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state)
+{
+	int lane;
+
+	// Build the free-lane stack one nibble per lane on top of a 0xf marker, lane 0 ending up lowest.
+	state->unused_lanes = 0xf;
+	state->num_lanes_inuse = 0;
+	for (lane = SHA256_MB_CE_MAX_LANES; lane-- > 0;) {
+		state->unused_lanes = (state->unused_lanes << 4) | lane;
+		state->lens[lane] = lane;	// no blocks pending, low nibble holds the lane id
+		state->ldata[lane].job_in_lane = NULL;
+	}
+
+	// Lanes from SHA256_MB_CE_MAX_LANES upwards are never handed out by this manager.
+	for (lane = SHA256_MB_CE_MAX_LANES; lane < SHA256_MAX_LANES; lane++) {
+		state->lens[lane] = 0xf;
+		state->ldata[lane].job_in_lane = NULL;
+	}
+}
+// Hash the shortest pending block count on every busy lane; returns the low nibble of that minimum (its lane index) or -1 if idle.
+static int sha256_mb_mgr_do_jobs(SHA256_MB_JOB_MGR * state)
+{
+	int lane_idx, len, i, lanes;
+	// Indices of lanes that still have blocks to hash; filled by the general path only.
+	int lane_idx_array[SHA256_MAX_LANES];
+
+	if (state->num_lanes_inuse == 0) {
+		return -1;
+	}
+#if SHA256_MB_CE_MAX_LANES == 4
+	if (state->num_lanes_inuse == 4) {
+		len = min(min(state->lens[0], state->lens[1]),
+			  min(state->lens[2], state->lens[3]));
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+
+		sha256_mb_ce_x4(state->ldata[0].job_in_lane,
+				state->ldata[1].job_in_lane,
+				state->ldata[2].job_in_lane,
+				state->ldata[3].job_in_lane, len >> 4);
+
+	} else
+#elif SHA256_MB_CE_MAX_LANES == 3
+	if (state->num_lanes_inuse == 3) {
+		len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+		// Fast path: every lane is occupied, so no lane lookup is needed.
+		sha256_mb_ce_x3(state->ldata[0].job_in_lane,
+				state->ldata[1].job_in_lane,
+				state->ldata[2].job_in_lane, len >> 4);
+
+	} else
+#elif	SHA256_MB_CE_MAX_LANES == 2
+	if (state->num_lanes_inuse == 2) {
+		len = min(state->lens[0], state->lens[1]);
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+
+		sha256_mb_ce_x2(state->ldata[0].job_in_lane,
+				state->ldata[1].job_in_lane, len >> 4);
+
+	} else
+#endif
+	{
+		lanes = 0, len = 0;
+		for (i = 0; i < SHA256_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+			if (LANE_IS_NOT_FINISHED(state, i)) {
+				if (lanes)
+					len = min(len, state->lens[i]);
+				else
+					len = state->lens[i];
+				lane_idx_array[lanes] = i;
+				lanes++;
+			}
+		}
+		if (lanes == 0)
+			return -1;
+		lane_idx = len & 0xf;
+		len = len & (~0xf);
+#if SHA256_MB_CE_MAX_LANES >=4
+		if (lanes == 4) {
+			sha256_mb_ce_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+					state->ldata[lane_idx_array[1]].job_in_lane,
+					state->ldata[lane_idx_array[2]].job_in_lane,
+					state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+
+		} else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+		if (lanes == 3) {
+			sha256_mb_ce_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+					state->ldata[lane_idx_array[1]].job_in_lane,
+					state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+		} else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+		if (lanes == 2) {
+			sha256_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+					state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+		} else
+#endif
+		{
+			sha256_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+		}
+	}
+	// Charge the hashed blocks to every unfinished lane; len is still (blocks << 4) here.
+	for (i = 0; i < SHA256_MAX_LANES; i++) {
+		if (LANE_IS_NOT_FINISHED(state, i)) {
+			state->lens[i] -= len;	// lane id nibble is untouched
+			state->ldata[i].job_in_lane->len -= len >> 4;	// job->len counts blocks, not blocks << 4
+			state->ldata[i].job_in_lane->buffer += len << 2;	// (len >> 4) blocks * 64 bytes
+		}
+	}
+
+	return lane_idx;
+
+}
+
+static SHA256_JOB *sha256_mb_mgr_free_lane(SHA256_MB_JOB_MGR * state)
+{
+	SHA256_JOB *done;
+	int lane;
+
+	// Retire at most one lane per call: the lowest-numbered lane whose job has no blocks left.
+	for (lane = 0; lane < SHA256_MB_CE_MAX_LANES; lane++) {
+		if (!LANE_IS_FINISHED(state, lane))
+			continue;
+
+		done = state->ldata[lane].job_in_lane;
+		done->status = STS_COMPLETED;
+		state->ldata[lane].job_in_lane = NULL;
+		state->unused_lanes = (state->unused_lanes << 4) | lane;	// push the lane id back
+		state->num_lanes_inuse--;
+		return done;
+	}
+	return NULL;
+}
+
+static void sha256_mb_mgr_insert_job(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+	// The next free lane id sits in the low nibble of the free-lane stack.
+	const int slot = state->unused_lanes & 0xf;
+
+	// A nibble outside the CE lane range means no lane is free: fatal error.
+	assert(slot < SHA256_MB_CE_MAX_LANES);
+	state->unused_lanes >>= 4;
+	state->num_lanes_inuse++;
+	state->ldata[slot].job_in_lane = job;
+	state->lens[slot] = (job->len << 4) | slot;	// block count above the lane id nibble
+}
+
+// Queue one job on the CE lane manager.
+// Returns a finished job when one is available after this call, otherwise NULL
+// (always NULL while fewer than SHA256_MB_CE_MAX_LANES lanes are occupied).
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+	SHA256_JOB *finished;
+	int min_lane;
+
+	// Park the new job in a free lane.
+	sha256_mb_mgr_insert_job(state, job);
+
+	// Hand back any lane that is already finished before doing more work.
+	finished = sha256_mb_mgr_free_lane(state);
+	if (finished != NULL)
+		return finished;
+
+	// Hashing is deferred until every CE lane holds a job.
+	if (state->num_lanes_inuse < SHA256_MB_CE_MAX_LANES)
+		return NULL;
+
+	// All lanes are busy: run them for the length of the shortest job.
+	min_lane = sha256_mb_mgr_do_jobs(state);
+	assert(min_lane != -1);
+	(void)min_lane;		// only the assert reads it
+
+	// Return the job that just ran out of blocks.
+	return sha256_mb_mgr_free_lane(state);
+}
+
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state)
+{
+	// First hand back a lane that has already finished, if there is one.
+	SHA256_JOB *finished = sha256_mb_mgr_free_lane(state);
+	if (finished == NULL) {
+		// Nothing ready: run whatever lanes are occupied, then look again.
+		// The return value of do_jobs is not needed here.
+		sha256_mb_mgr_do_jobs(state);
+		finished = sha256_mb_mgr_free_lane(state);
+	}
+	return finished;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
new file mode 100644
index 000000000..ecc5fc5f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include <aarch64_multibinary.h>
+
+/* NOTE(review): mbin_interface is defined in aarch64_multibinary.h; presumably one dispatched entry point per name - confirm. */
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
new file mode 100644
index 000000000..06d0ab5fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
@@ -0,0 +1,238 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+Macros
+*/
+/* declare_var_vector_reg name,reg: alias name_q, name_v and name_s to registers q<reg>, v<reg> and s<reg>. */
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_q	.req	q\reg
+	\name\()_v	.req	v\reg
+	\name\()_s	.req	s\reg
+.endm
+/**
+macro for four rounds with no message schedule update (used directly for rounds 48-59, and inside sha256_4_rounds_low)
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+	ldr		key_q , [tmp]			// load the next four round constants
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b	// keep abcd: sha256h overwrites it and sha256h2 reads the old value
+	add		tmp,tmp,16			// advance the constant pointer
+	add		l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s	// message words + constants for the following four rounds
+	sha256h		l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s	// these four rounds use the sum prepared one step earlier
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+
+.endm
+/**
+macro for rounds 0-47: four rounds (sha256_4_rounds_high) wrapped in the sha256su0/sha256su1 schedule update of msg0
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+	sha256su0		l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0,\tmp1
+	sha256su1		l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+	declare_var_vector_reg	key,31
+/* Only v0-v6, v16-v19 and v31 are aliased; none of them is in the callee-saved range v8-v15. */
+
+/*
+digest variables
+*/
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l0_abcd_saved,5
+	declare_var_vector_reg	l0_efgh_saved,6
+/*
+Temporary variables
+*/
+	declare_var_vector_reg	l0_tmp0,2
+	declare_var_vector_reg	l0_tmp1,3
+	declare_var_vector_reg	l0_tmp2,4
+/*
+Message variables
+*/
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+
+
+
+/*
+	void sha256_mb_ce_x1(SHA256_JOB * l0_job, int len);
+*/
+/*
+Arguments list
+*/
+	l0_job 	.req	x0
+	len	.req	w1
+	l0_data	.req	x2
+	tmp	.req	x3
+	.global	sha256_mb_ce_x1
+	.type	sha256_mb_ce_x1, %function
+sha256_mb_ce_x1:
+	ldr	l0_data, [l0_job]		// first 8 bytes of the job; presumably the buffer pointer - confirm against SHA256_JOB
+	ldr	l0_abcd_q, [l0_job, 64]		// digest words at hard-coded offsets 64 and 80; presumably result_digest - confirm
+	ldr	l0_efgh_q, [l0_job, 80]
+
+// One iteration per 64-byte block; the body runs at least once whatever len is.
+
+start_loop:
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ldr	key_q,[tmp]
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	sub	len, len, #1
+	cmp	len, 0				// flags are consumed by the bgt at the end; nothing in between sets flags
+	//backup digest
+	mov	l0_abcd_saved_v.16b,l0_abcd_v.16b
+	mov	l0_efgh_saved_v.16b,l0_efgh_v.16b
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// byte-swap every 32-bit message word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s,l0_msg0_v.4s,key_v.4s	// message words + constants for rounds 0-3
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_high	msg1,tmp0,tmp1			/* rounds 48-51 */
+	sha256_4_rounds_high	msg2,tmp1,tmp0
+	sha256_4_rounds_high	msg3,tmp0,tmp1
+
+	/* rounds 60-63 */
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b
+	sha256h		l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+
+	// add the digest saved at the top of the iteration to the round output
+	add     l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+	add     l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+
+
+	bgt	start_loop			// loop while the decremented len is still > 0
+	str	l0_abcd_q,	[l0_job, 64]	// write the updated digest back into the job
+	str	l0_efgh_q, 	[l0_job, 80]
+
+	ret
+
+	.size	sha256_mb_ce_x1, .-sha256_mb_ce_x1
+	.section	.rodata		// plain read-only data: a table walked by pointer arithmetic must not sit in a mergeable entsize-16 section
+	.align	4
+KEY:				// 64 round constants, read 16 bytes at a time by sha256_mb_ce_x1
+	.word 0x428A2F98
+	.word 0x71374491
+	.word 0xB5C0FBCF
+	.word 0xE9B5DBA5
+	.word 0x3956C25B
+	.word 0x59F111F1
+	.word 0x923F82A4
+	.word 0xAB1C5ED5
+	.word 0xD807AA98
+	.word 0x12835B01
+	.word 0x243185BE
+	.word 0x550C7DC3
+	.word 0x72BE5D74
+	.word 0x80DEB1FE
+	.word 0x9BDC06A7
+	.word 0xC19BF174
+	.word 0xE49B69C1
+	.word 0xEFBE4786
+	.word 0x0FC19DC6
+	.word 0x240CA1CC
+	.word 0x2DE92C6F
+	.word 0x4A7484AA
+	.word 0x5CB0A9DC
+	.word 0x76F988DA
+	.word 0x983E5152
+	.word 0xA831C66D
+	.word 0xB00327C8
+	.word 0xBF597FC7
+	.word 0xC6E00BF3
+	.word 0xD5A79147
+	.word 0x06CA6351
+	.word 0x14292967
+	.word 0x27B70A85
+	.word 0x2E1B2138
+	.word 0x4D2C6DFC
+	.word 0x53380D13
+	.word 0x650A7354
+	.word 0x766A0ABB
+	.word 0x81C2C92E
+	.word 0x92722C85
+	.word 0xA2BFE8A1
+	.word 0xA81A664B
+	.word 0xC24B8B70
+	.word 0xC76C51A3
+	.word 0xD192E819
+	.word 0xD6990624
+	.word 0xF40E3585
+	.word 0x106AA070
+	.word 0x19A4C116
+	.word 0x1E376C08
+	.word 0x2748774C
+	.word 0x34B0BCB5
+	.word 0x391C0CB3
+	.word 0x4ED8AA4A
+	.word 0x5B9CCA4F
+	.word 0x682E6FF3
+	.word 0x748F82EE
+	.word 0x78A5636F
+	.word 0x84C87814
+	.word 0x8CC70208
+	.word 0x90BEFFFA
+	.word 0xA4506CEB
+	.word 0xBEF9A3F7
+	.word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
new file mode 100644
index 000000000..dadf44bb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
@@ -0,0 +1,289 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+declare_var_vector_reg name, reg
+Binds name_s, name_v and name_q to the s, v and q views of SIMD register number reg.
+*/
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_s	.req	s\reg
+	\name\()_v	.req	v\reg
+	\name\()_q	.req	q\reg
+.endm
+/** Four SHA-256 rounds on both lanes, without message-schedule update.
+    tmp0 (in) holds message words plus round constants (W+K) for the rounds executed now;
+    tmp1 (out) receives msg plus the next four KEY words. Clobbers key and lN_tmp2; tmp += 16. */
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+	ldr		key_q , [tmp]	// next four round constants
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b	// keep pre-round abcd, sha256h2 needs it
+	mov		l1_tmp2_v.16b,l1_abcd_v.16b
+	add		tmp,tmp,16
+	add		l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s	// W+K for the following call
+	add		l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+	sha256h		l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+	sha256h		l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+
+.endm
+/** Four rounds plus message-schedule update on both lanes (used for rounds 0-47).
+    msg0 is rewritten in place from msg0..msg3 by sha256su0/sha256su1;
+    msg1 is the block that sha256_4_rounds_high adds to KEY for the following call. */
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+	sha256su0		l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256su0		l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0,\tmp1
+	sha256su1		l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+	sha256su1		l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+SIMD register map. Each entry also defines the _q, _v and _s aliases.
+*/
+
+/* round constants currently in flight, shared by both lanes */
+	declare_var_vector_reg	key,31
+
+/*
+lane 0: digest, digest backup, W+K scratch pair, abcd copy, message block
+*/
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l0_abcd_saved,2
+	declare_var_vector_reg	l0_efgh_saved,3
+	declare_var_vector_reg	l0_tmp0,8
+	declare_var_vector_reg	l0_tmp1,9
+	declare_var_vector_reg	l0_tmp2,10
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+
+/*
+lane 1: same roles as lane 0
+*/
+	declare_var_vector_reg	l1_abcd,4
+	declare_var_vector_reg	l1_efgh,5
+	declare_var_vector_reg	l1_abcd_saved,6
+	declare_var_vector_reg	l1_efgh_saved,7
+	declare_var_vector_reg	l1_tmp0,11
+	declare_var_vector_reg	l1_tmp1,12
+	declare_var_vector_reg	l1_tmp2,13
+	declare_var_vector_reg	l1_msg0,20
+	declare_var_vector_reg	l1_msg1,21
+	declare_var_vector_reg	l1_msg2,22
+	declare_var_vector_reg	l1_msg3,23
+
+/* v8-v13 alias callee-saved d8-d13; the function entry code saves them */
+
+
+
+/*
+	void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+	Processes len 64-byte blocks for two jobs in lock-step. For each job the
+	input pointer is read from offset 0 and the digest from offsets 64/80;
+	the digest is written back once after the last block. No-op when len <= 0.
+*/
+	l0_job 	.req	x0	// arg 0: lane 0 job
+	l1_job 	.req	x1	// arg 1: lane 1 job
+	len	.req	w2	// arg 2: blocks per lane (signed)
+	l0_data	.req	x3	// input cursor, lane 0
+	l1_data	.req	x4	// input cursor, lane 1
+	tmp	.req	x5	// cursor into KEY
+	.global	sha256_mb_ce_x2
+	.type	sha256_mb_ce_x2, %function
+sha256_mb_ce_x2:
+	// the loop is do-while shaped, so reject an empty request before touching anything
+	cmp	len, 0
+	ble	.Lx2_return
+	// save d8-d15; four pairs need a 64-byte frame (sp stays 16-byte aligned)
+	stp 	d8,d9,[sp,-64]!
+	stp 	d10,d11,[sp,16]
+	stp 	d12,d13,[sp,32]
+	stp 	d14,d15,[sp,48]
+	ldr	l0_data, [l0_job]
+	ldr	l0_abcd_q, [l0_job, 64]
+	ldr	l0_efgh_q, [l0_job, 80]
+	ldr	l1_data,   [l1_job]
+	ldr	l1_abcd_q, [l1_job, 64]
+	ldr	l1_efgh_q, [l1_job, 80]
+
+start_loop:
+
+	//load key addr
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+	ldr	key_q,[tmp]
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	add	l1_data,l1_data,64
+	sub	len, len, #1
+	cmp	len, 0			// flags stay live until the bgt at the loop tail
+	//backup digest
+	mov	l0_abcd_saved_v.16b,l0_abcd_v.16b
+	mov	l0_efgh_saved_v.16b,l0_efgh_v.16b
+	mov	l1_abcd_saved_v.16b,l1_abcd_v.16b
+	mov	l1_efgh_saved_v.16b,l1_efgh_v.16b
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// byte-swap every 32-bit message word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s	// W+K for rounds 0-3
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+	rev32	l1_msg0_v.16b,l1_msg0_v.16b
+	rev32	l1_msg1_v.16b,l1_msg1_v.16b
+	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+	rev32	l1_msg2_v.16b,l1_msg2_v.16b
+	rev32	l1_msg3_v.16b,l1_msg3_v.16b
+
+	// each macro call runs four rounds and leaves W+K for the next call in its last argument
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_high	msg1,tmp0,tmp1			/* rounds 48-51 */
+	sha256_4_rounds_high	msg2,tmp1,tmp0
+	sha256_4_rounds_high	msg3,tmp0,tmp1
+
+	/* rounds 60-63 */
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b
+	sha256h		l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+	mov		l1_tmp2_v.16b,l1_abcd_v.16b
+	sha256h		l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+	// feed-forward: add the digest saved at the top of this iteration
+	add     l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+	add     l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+	add     l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+	add     l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+
+	bgt	start_loop
+	// all blocks consumed: write both digests back to their jobs
+	str	l0_abcd_q,	[l0_job, 64]
+	str	l0_efgh_q, 	[l0_job, 80]
+	str	l1_abcd_q,	[l1_job, 64]
+	str	l1_efgh_q, 	[l1_job, 80]
+
+	ldp 	d10,d11,[sp,16]
+	ldp 	d12,d13,[sp,32]
+	ldp 	d14,d15,[sp,48]
+	ldp     d8, d9, [sp], 64
+.Lx2_return:
+	ret
+
+	.size	sha256_mb_ce_x2, .-sha256_mb_ce_x2
+	.section	.rol0_data.cst16,"aM",@progbits,16	// NOTE(review): name looks like a garbled .rodata.cst16 - confirm before touching
+	.align	4	// power-of-two alignment on AArch64 GAS, i.e. 16 bytes
+KEY:	// the 64 SHA-256 round constants K[0..63]; the code above walks them 16 bytes at a time through tmp
+	.word 0x428A2F98	// K[0]
+	.word 0x71374491
+	.word 0xB5C0FBCF
+	.word 0xE9B5DBA5
+	.word 0x3956C25B
+	.word 0x59F111F1
+	.word 0x923F82A4
+	.word 0xAB1C5ED5
+	.word 0xD807AA98
+	.word 0x12835B01
+	.word 0x243185BE
+	.word 0x550C7DC3
+	.word 0x72BE5D74
+	.word 0x80DEB1FE
+	.word 0x9BDC06A7
+	.word 0xC19BF174
+	.word 0xE49B69C1	// K[16]
+	.word 0xEFBE4786
+	.word 0x0FC19DC6
+	.word 0x240CA1CC
+	.word 0x2DE92C6F
+	.word 0x4A7484AA
+	.word 0x5CB0A9DC
+	.word 0x76F988DA
+	.word 0x983E5152
+	.word 0xA831C66D
+	.word 0xB00327C8
+	.word 0xBF597FC7
+	.word 0xC6E00BF3
+	.word 0xD5A79147
+	.word 0x06CA6351
+	.word 0x14292967
+	.word 0x27B70A85	// K[32]
+	.word 0x2E1B2138
+	.word 0x4D2C6DFC
+	.word 0x53380D13
+	.word 0x650A7354
+	.word 0x766A0ABB
+	.word 0x81C2C92E
+	.word 0x92722C85
+	.word 0xA2BFE8A1
+	.word 0xA81A664B
+	.word 0xC24B8B70
+	.word 0xC76C51A3
+	.word 0xD192E819
+	.word 0xD6990624
+	.word 0xF40E3585
+	.word 0x106AA070
+	.word 0x19A4C116	// K[48]
+	.word 0x1E376C08
+	.word 0x2748774C
+	.word 0x34B0BCB5
+	.word 0x391C0CB3
+	.word 0x4ED8AA4A
+	.word 0x5B9CCA4F
+	.word 0x682E6FF3
+	.word 0x748F82EE
+	.word 0x78A5636F
+	.word 0x84C87814
+	.word 0x8CC70208
+	.word 0x90BEFFFA
+	.word 0xA4506CEB
+	.word 0xBEF9A3F7
+	.word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
new file mode 100644
index 000000000..6ed1591ba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
@@ -0,0 +1,342 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+declare_var_vector_reg name, reg
+Binds name_s, name_v and name_q to the s, v and q views of SIMD register number reg.
+*/
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_s	.req	s\reg
+	\name\()_v	.req	v\reg
+	\name\()_q	.req	q\reg
+.endm
+/** Four SHA-256 rounds on all three lanes, without message-schedule update.
+    tmp0 (in) holds message words plus round constants (W+K) for the rounds executed now;
+    tmp1 (out) receives msg plus the next four KEY words. Clobbers key and lN_tmp2; tmp += 16. */
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+	ldr		key_q , [tmp]	// next four round constants
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b	// keep pre-round abcd, sha256h2 needs it
+	mov		l1_tmp2_v.16b,l1_abcd_v.16b
+	mov		l2_tmp2_v.16b,l2_abcd_v.16b
+	add		tmp,tmp,16
+	add		l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s	// W+K for the following call
+	add		l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+	add		l2_\tmp1\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+	sha256h		l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+	sha256h		l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+	sha256h		l2_abcd_q,l2_efgh_q,l2_\tmp0\()_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+	sha256h2	l2_efgh_q,l2_tmp2_q,l2_\tmp0\()_v.4s
+
+.endm
+/** Four rounds plus message-schedule update on all three lanes (used for rounds 0-47).
+    msg0 is rewritten in place from msg0..msg3 by sha256su0/sha256su1;
+    msg1 is the block that sha256_4_rounds_high adds to KEY for the following call. */
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+	sha256su0		l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256su0		l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+	sha256su0		l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0,\tmp1
+	sha256su1		l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+	sha256su1		l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+	sha256su1		l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+.endm
+
+
+/*
+SIMD register map. Each entry also defines the _q, _v and _s aliases. key is shared by all lanes.
+*/
+	declare_var_vector_reg	key,31
+
+/*
+lane 0: digest, digest backup (same registers as l1_msg0/l1_msg1), W+K scratch pair, abcd copy, message block
+*/
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l0_abcd_saved,20
+	declare_var_vector_reg	l0_efgh_saved,21
+	declare_var_vector_reg	l0_tmp0,6
+	declare_var_vector_reg	l0_tmp1,7
+	declare_var_vector_reg	l0_tmp2,8
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+
+/*
+lane 1: same roles; its digest backup uses the registers of l0_msg0/l0_msg1
+*/
+	declare_var_vector_reg	l1_abcd,2
+	declare_var_vector_reg	l1_efgh,3
+	declare_var_vector_reg	l1_abcd_saved,16
+	declare_var_vector_reg	l1_efgh_saved,17
+	declare_var_vector_reg	l1_tmp0,9
+	declare_var_vector_reg	l1_tmp1,10
+	declare_var_vector_reg	l1_tmp2,11
+	declare_var_vector_reg	l1_msg0,20
+	declare_var_vector_reg	l1_msg1,21
+	declare_var_vector_reg	l1_msg2,22
+	declare_var_vector_reg	l1_msg3,23
+
+/*
+lane 2: same roles; its digest backup uses the registers of l2_msg0/l2_msg1
+*/
+	declare_var_vector_reg	l2_abcd,4
+	declare_var_vector_reg	l2_efgh,5
+	declare_var_vector_reg	l2_abcd_saved,24
+	declare_var_vector_reg	l2_efgh_saved,25
+	declare_var_vector_reg	l2_tmp0,12
+	declare_var_vector_reg	l2_tmp1,13
+	declare_var_vector_reg	l2_tmp2,14
+	declare_var_vector_reg	l2_msg0,24
+	declare_var_vector_reg	l2_msg1,25
+	declare_var_vector_reg	l2_msg2,26
+	declare_var_vector_reg	l2_msg3,27
+
+
+
+/*
+	void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+	Processes len 64-byte blocks for three jobs in lock-step. For each job
+	the input pointer is read from offset 0; the digest at offsets 64/80 is
+	read, updated and written back once per block. No-op when len <= 0.
+*/
+	l0_job 	.req	x0	// arg 0: lane 0 job
+	l1_job 	.req	x1	// arg 1: lane 1 job
+	l2_job 	.req	x2	// arg 2: lane 2 job
+	len	.req	w3	// arg 3: blocks per lane (signed)
+	l0_data	.req	x4	// input cursor, lane 0
+	l1_data	.req	x5	// input cursor, lane 1
+	l2_data	.req	x6	// input cursor, lane 2
+	tmp	.req	x7	// cursor into KEY
+	.global	sha256_mb_ce_x3
+	.type	sha256_mb_ce_x3, %function
+sha256_mb_ce_x3:
+	// the loop is do-while shaped, so reject an empty request before
+	// touching the stack, the input buffers or the digests
+	cmp	len, 0
+	ble	.Lx3_return
+	// save d8-d15; four pairs need a 64-byte frame (sp stays 16-byte aligned)
+	stp 	d8,d9,[sp,-64]!
+	stp 	d10,d11,[sp,16]
+	stp 	d12,d13,[sp,32]
+	stp 	d14,d15,[sp,48]
+	ldr	l0_data, [l0_job]
+	ldr	l0_abcd_q, [l0_job, 64]
+	ldr	l0_efgh_q, [l0_job, 80]
+	ldr	l1_data,   [l1_job]
+	ldr	l1_abcd_q, [l1_job, 64]
+	ldr	l1_efgh_q, [l1_job, 80]
+	ldr	l2_data,   [l2_job]
+	ldr	l2_abcd_q, [l2_job, 64]
+	ldr	l2_efgh_q, [l2_job, 80]
+
+
+
+start_loop:
+
+	//load key addr
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+	ld1	{l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+	ldr	key_q,[tmp]
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	add	l1_data,l1_data,64
+	add	l2_data,l2_data,64
+	sub	len, len, #1
+	cmp	len, 0			// flags stay live until the bgt at the loop tail
+	// no register copy of the digest is taken here: the pre-block value is
+	// reloaded from the jobs further down, once msg0/msg1 are no longer needed
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// byte-swap every 32-bit message word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s	// W+K for rounds 0-3
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+	rev32	l1_msg0_v.16b,l1_msg0_v.16b
+	rev32	l1_msg1_v.16b,l1_msg1_v.16b
+	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+	rev32	l1_msg2_v.16b,l1_msg2_v.16b
+	rev32	l1_msg3_v.16b,l1_msg3_v.16b
+
+	rev32	l2_msg0_v.16b,l2_msg0_v.16b
+	rev32	l2_msg1_v.16b,l2_msg1_v.16b
+	add	l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+	rev32	l2_msg2_v.16b,l2_msg2_v.16b
+	rev32	l2_msg3_v.16b,l2_msg3_v.16b
+
+
+	// each macro call runs four rounds and leaves W+K for the next call in its last argument
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1    /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+
+
+	sha256_4_rounds_high	msg1,tmp0,tmp1			/* rounds 48-51 */
+
+	/* msg0 and msg1 are dead from here on; their registers now hold the pre-block digest */
+	ldr	l0_abcd_saved_q, [l0_job, 64]
+	ldr	l1_abcd_saved_q, [l1_job, 64]
+	ldr	l2_abcd_saved_q, [l2_job, 64]
+	ldr	l0_efgh_saved_q, [l0_job, 80]
+	ldr	l1_efgh_saved_q, [l1_job, 80]
+	ldr	l2_efgh_saved_q, [l2_job, 80]
+
+	sha256_4_rounds_high	msg2,tmp1,tmp0
+	sha256_4_rounds_high	msg3,tmp0,tmp1
+
+	/* rounds 60-63 */
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b
+	sha256h		l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+	mov		l1_tmp2_v.16b,l1_abcd_v.16b
+	sha256h		l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+	mov		l2_tmp2_v.16b,l2_abcd_v.16b
+	sha256h		l2_abcd_q,l2_efgh_q,l2_tmp1_v.4s
+	sha256h2	l2_efgh_q,l2_tmp2_q,l2_tmp1_v.4s
+
+	/* combine state */
+	add     l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+	add     l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+	add     l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+	add     l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+	add     l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+	add     l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+
+	// store after every block: the next iteration reloads the backup from these slots
+	str	l0_abcd_q,	[l0_job, 64]
+	str	l0_efgh_q, 	[l0_job, 80]
+	str	l1_abcd_q,	[l1_job, 64]
+	str	l1_efgh_q, 	[l1_job, 80]
+	str	l2_abcd_q,	[l2_job, 64]
+	str	l2_efgh_q, 	[l2_job, 80]
+
+	bgt	start_loop
+
+
+	// restore d8-d15 and release the frame
+	ldp 	d10,d11,[sp,16]
+	ldp 	d12,d13,[sp,32]
+	ldp 	d14,d15,[sp,48]
+	ldp     d8, d9, [sp], 64
+.Lx3_return:
+	ret
+
+	.size	sha256_mb_ce_x3, .-sha256_mb_ce_x3
+	.section	.rol0_data.cst16,"aM",@progbits,16	// NOTE(review): name looks like a garbled .rodata.cst16 - confirm before touching
+	.align	4	// power-of-two alignment on AArch64 GAS, i.e. 16 bytes
+KEY:	// the 64 SHA-256 round constants K[0..63]; the code above walks them 16 bytes at a time through tmp
+	.word 0x428A2F98	// K[0]
+	.word 0x71374491
+	.word 0xB5C0FBCF
+	.word 0xE9B5DBA5
+	.word 0x3956C25B
+	.word 0x59F111F1
+	.word 0x923F82A4
+	.word 0xAB1C5ED5
+	.word 0xD807AA98
+	.word 0x12835B01
+	.word 0x243185BE
+	.word 0x550C7DC3
+	.word 0x72BE5D74
+	.word 0x80DEB1FE
+	.word 0x9BDC06A7
+	.word 0xC19BF174
+	.word 0xE49B69C1	// K[16]
+	.word 0xEFBE4786
+	.word 0x0FC19DC6
+	.word 0x240CA1CC
+	.word 0x2DE92C6F
+	.word 0x4A7484AA
+	.word 0x5CB0A9DC
+	.word 0x76F988DA
+	.word 0x983E5152
+	.word 0xA831C66D
+	.word 0xB00327C8
+	.word 0xBF597FC7
+	.word 0xC6E00BF3
+	.word 0xD5A79147
+	.word 0x06CA6351
+	.word 0x14292967
+	.word 0x27B70A85	// K[32]
+	.word 0x2E1B2138
+	.word 0x4D2C6DFC
+	.word 0x53380D13
+	.word 0x650A7354
+	.word 0x766A0ABB
+	.word 0x81C2C92E
+	.word 0x92722C85
+	.word 0xA2BFE8A1
+	.word 0xA81A664B
+	.word 0xC24B8B70
+	.word 0xC76C51A3
+	.word 0xD192E819
+	.word 0xD6990624
+	.word 0xF40E3585
+	.word 0x106AA070
+	.word 0x19A4C116	// K[48]
+	.word 0x1E376C08
+	.word 0x2748774C
+	.word 0x34B0BCB5
+	.word 0x391C0CB3
+	.word 0x4ED8AA4A
+	.word 0x5B9CCA4F
+	.word 0x682E6FF3
+	.word 0x748F82EE
+	.word 0x78A5636F
+	.word 0x84C87814
+	.word 0x8CC70208
+	.word 0x90BEFFFA
+	.word 0xA4506CEB
+	.word 0xBEF9A3F7
+	.word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
new file mode 100644
index 000000000..b1686ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
@@ -0,0 +1,380 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+declare_var_vector_reg name, reg
+Binds name_s, name_v and name_q to the s, v and q views of SIMD register number reg.
+*/
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_s	.req	s\reg
+	\name\()_v	.req	v\reg
+	\name\()_q	.req	q\reg
+.endm
+/** Four SHA-256 rounds on all four lanes, without message-schedule update.
+    tmp0 (in/out): lN_tmp0 enters holding message words plus round constants (W+K) for the
+    rounds executed now and leaves holding msg plus the next four KEY words. There is no tmp1 parameter.
+    Lanes are processed as two pairs; key shares v15 with tmp3, hence the second ldr of the same constants.
+    Clobbers key and the shared scratch tmp0..tmp3; tmp += 16. */
+.macro sha256_4_rounds_high msg:req,tmp0:req
+	ldr		key_q , [tmp]	// four round constants for the following call
+	mov		tmp0_v.16b,l0_\tmp0\()_v.16b	// move incoming W+K aside before it is overwritten
+	mov		tmp1_v.16b,l1_\tmp0\()_v.16b
+	add		l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+	add		l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+	mov		tmp2_v.16b,l0_abcd_v.16b	// keep pre-round abcd, sha256h2 needs it
+	mov		tmp3_v.16b,l1_abcd_v.16b	// overwrites key (same register)
+	sha256h		l0_abcd_q,l0_efgh_q,tmp0_v.4s
+	sha256h		l1_abcd_q,l1_efgh_q,tmp1_v.4s
+	sha256h2	l0_efgh_q,tmp2_q,tmp0_v.4s
+	sha256h2	l1_efgh_q,tmp3_q,tmp1_v.4s
+
+	ldr		key_q , [tmp]	// reload: tmp3 destroyed key above
+	mov		tmp0_v.16b,l2_\tmp0\()_v.16b
+	mov		tmp1_v.16b,l3_\tmp0\()_v.16b
+	add		tmp,tmp,16
+	add		l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+	add		l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s
+	mov		tmp2_v.16b,l2_abcd_v.16b
+	mov		tmp3_v.16b,l3_abcd_v.16b
+	sha256h		l2_abcd_q,l2_efgh_q,tmp0_v.4s
+	sha256h		l3_abcd_q,l3_efgh_q,tmp1_v.4s
+	sha256h2	l2_efgh_q,tmp2_q,tmp0_v.4s
+	sha256h2	l3_efgh_q,tmp3_q,tmp1_v.4s
+
+
+.endm
+/** Four rounds plus message-schedule update on all four lanes (used for rounds 0-47).
+    msg0 is rewritten in place from msg0..msg3 by sha256su0/sha256su1;
+    msg1 is the block that sha256_4_rounds_high adds to KEY for the following call. */
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req
+	sha256su0		l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256su0		l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+	sha256su0		l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+	sha256su0		l3_\msg0\()_v.4s,l3_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0
+	sha256su1		l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+	sha256su1		l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+	sha256su1		l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+	sha256su1		l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s
+.endm
+
+
+/*
+SIMD register map. Each entry also defines the _q, _v and _s aliases.
+key and tmp3 are the same register (v15), so key only stays valid until tmp3 is written.
+tmp0..tmp3 are scratch registers shared by all four lanes.
+*/
+	declare_var_vector_reg	key,15
+	declare_var_vector_reg	tmp0,12
+	declare_var_vector_reg	tmp1,13
+	declare_var_vector_reg	tmp2,14
+	declare_var_vector_reg	tmp3,15
+
+/* lane 0: digest, digest backup (same registers as l1_msg0/l1_msg1),
+   W+K scratch, message block */
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l0_abcd_saved,20
+	declare_var_vector_reg	l0_efgh_saved,21
+	declare_var_vector_reg	l0_tmp0,8
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+
+/* lane 1: same roles; its digest backup uses the registers of
+   l0_msg0/l0_msg1 */
+	declare_var_vector_reg	l1_abcd,2
+	declare_var_vector_reg	l1_efgh,3
+	declare_var_vector_reg	l1_abcd_saved,16
+	declare_var_vector_reg	l1_efgh_saved,17
+	declare_var_vector_reg	l1_tmp0,9
+	declare_var_vector_reg	l1_msg0,20
+	declare_var_vector_reg	l1_msg1,21
+	declare_var_vector_reg	l1_msg2,22
+	declare_var_vector_reg	l1_msg3,23
+
+/* lane 2: same roles; its digest backup uses the registers of
+   l2_msg0/l2_msg1 */
+	declare_var_vector_reg	l2_abcd,4
+	declare_var_vector_reg	l2_efgh,5
+	declare_var_vector_reg	l2_abcd_saved,24
+	declare_var_vector_reg	l2_efgh_saved,25
+	declare_var_vector_reg	l2_tmp0,10
+	declare_var_vector_reg	l2_msg0,24
+	declare_var_vector_reg	l2_msg1,25
+	declare_var_vector_reg	l2_msg2,26
+	declare_var_vector_reg	l2_msg3,27
+
+/* lane 3: same roles; its digest backup uses the registers of
+   l3_msg0/l3_msg1 */
+	declare_var_vector_reg	l3_abcd,6
+	declare_var_vector_reg	l3_efgh,7
+	declare_var_vector_reg	l3_abcd_saved,28
+	declare_var_vector_reg	l3_efgh_saved,29
+	declare_var_vector_reg	l3_tmp0,11
+	declare_var_vector_reg	l3_msg0,28
+	declare_var_vector_reg	l3_msg1,29
+	declare_var_vector_reg	l3_msg2,30
+	declare_var_vector_reg	l3_msg3,31
+
+
+
+/*
+	void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+	Processes len 64-byte blocks for four jobs in lock-step. For each job
+	the input pointer is read from offset 0; the digest at offsets 64/80 is
+	read, updated and written back once per block. No-op when len <= 0.
+*/
+	l0_job 	.req	x0	// arg 0: lane 0 job
+	l1_job 	.req	x1	// arg 1: lane 1 job
+	l2_job 	.req	x2	// arg 2: lane 2 job
+	l3_job 	.req	x3	// arg 3: lane 3 job
+	len	.req	w4	// arg 4: blocks per lane (signed)
+	l0_data	.req	x5	// input cursor, lane 0
+	l1_data	.req	x6	// input cursor, lane 1
+	l2_data	.req	x7	// input cursor, lane 2
+	l3_data	.req	x8	// input cursor, lane 3
+	tmp	.req	x9	// cursor into KEY
+	.global	sha256_mb_ce_x4
+	.type	sha256_mb_ce_x4, %function
+sha256_mb_ce_x4:
+	// the loop is do-while shaped, so reject an empty request before
+	// touching the stack, the input buffers or the digests
+	cmp	len, 0
+	ble	.Lx4_return
+	// save d8-d15; four pairs need a 64-byte frame (sp stays 16-byte aligned)
+	stp 	d8,d9,[sp,-64]!
+	stp 	d10,d11,[sp,16]
+	stp 	d12,d13,[sp,32]
+	stp 	d14,d15,[sp,48]
+	ldr	l0_data, [l0_job]
+	ldr	l0_abcd_q, [l0_job, 64]
+	ldr	l0_efgh_q, [l0_job, 80]
+	ldr	l1_data,   [l1_job]
+	ldr	l1_abcd_q, [l1_job, 64]
+	ldr	l1_efgh_q, [l1_job, 80]
+	ldr	l2_data,   [l2_job]
+	ldr	l2_abcd_q, [l2_job, 64]
+	ldr	l2_efgh_q, [l2_job, 80]
+	ldr	l3_data,   [l3_job]
+	ldr	l3_abcd_q, [l3_job, 64]
+	ldr	l3_efgh_q, [l3_job, 80]
+
+start_loop:
+
+	//load key addr
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+	ld1	{l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+	ld1	{l3_msg0_v.4s-l3_msg3_v.4s},[l3_data]
+	ldr	key_q,[tmp]
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	add	l1_data,l1_data,64
+	add	l2_data,l2_data,64
+	add	l3_data,l3_data,64
+	sub	len, len, #1
+	cmp	len, 0			// flags stay live until the bgt at the loop tail
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// byte-swap every 32-bit message word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s	// W+K for rounds 0-3
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+	rev32	l1_msg0_v.16b,l1_msg0_v.16b
+	rev32	l1_msg1_v.16b,l1_msg1_v.16b
+	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+	rev32	l1_msg2_v.16b,l1_msg2_v.16b
+	rev32	l1_msg3_v.16b,l1_msg3_v.16b
+
+	rev32	l2_msg0_v.16b,l2_msg0_v.16b
+	rev32	l2_msg1_v.16b,l2_msg1_v.16b
+	add	l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+	rev32	l2_msg2_v.16b,l2_msg2_v.16b
+	rev32	l2_msg3_v.16b,l2_msg3_v.16b
+
+	rev32	l3_msg0_v.16b,l3_msg0_v.16b
+	rev32	l3_msg1_v.16b,l3_msg1_v.16b
+	add	l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s
+	rev32	l3_msg2_v.16b,l3_msg2_v.16b
+	rev32	l3_msg3_v.16b,l3_msg3_v.16b
+
+	// each macro call runs four rounds and leaves W+K for the next call in lN_tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0    /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0    /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0    /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
+
+	sha256_4_rounds_high	msg1,tmp0			/* rounds 48-51 */
+
+	/* msg0 and msg1 are dead from here on; their registers now hold the pre-block digest */
+	ldr	l0_abcd_saved_q, [l0_job, 64]
+	ldr	l1_abcd_saved_q, [l1_job, 64]
+	ldr	l2_abcd_saved_q, [l2_job, 64]
+	ldr	l3_abcd_saved_q, [l3_job, 64]
+	ldr	l0_efgh_saved_q, [l0_job, 80]
+	ldr	l1_efgh_saved_q, [l1_job, 80]
+	ldr	l2_efgh_saved_q, [l2_job, 80]
+	ldr	l3_efgh_saved_q, [l3_job, 80]
+
+	sha256_4_rounds_high	msg2,tmp0
+	sha256_4_rounds_high	msg3,tmp0
+
+	/* rounds 60-63 */
+	mov		tmp2_v.16b,l0_abcd_v.16b
+	sha256h		l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s
+	sha256h2	l0_efgh_q,tmp2_q,l0_tmp0_v.4s
+
+	mov		tmp2_v.16b,l1_abcd_v.16b
+	sha256h		l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s
+	sha256h2	l1_efgh_q,tmp2_q,l1_tmp0_v.4s
+
+	mov		tmp2_v.16b,l2_abcd_v.16b
+	sha256h		l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s
+	sha256h2	l2_efgh_q,tmp2_q,l2_tmp0_v.4s
+
+	mov		tmp2_v.16b,l3_abcd_v.16b
+	sha256h		l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s
+	sha256h2	l3_efgh_q,tmp2_q,l3_tmp0_v.4s
+
+	/* combine state */
+	add     l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+	add     l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+	add     l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+	add     l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+	add     l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+	add     l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+	add     l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s
+	add     l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s
+
+	// store after every block: the next iteration reloads the backup from these slots
+	str	l0_abcd_q,	[l0_job, 64]
+	str	l0_efgh_q, 	[l0_job, 80]
+	str	l1_abcd_q,	[l1_job, 64]
+	str	l1_efgh_q, 	[l1_job, 80]
+	str	l2_abcd_q,	[l2_job, 64]
+	str	l2_efgh_q, 	[l2_job, 80]
+	str	l3_abcd_q,	[l3_job, 64]
+	str	l3_efgh_q, 	[l3_job, 80]
+
+	bgt	start_loop
+
+	// restore d8-d15 and release the frame
+	ldp 	d10,d11,[sp,16]
+	ldp 	d12,d13,[sp,32]
+	ldp 	d14,d15,[sp,48]
+	ldp     d8, d9, [sp], 64
+.Lx4_return:
+	ret
+
+	.size	sha256_mb_ce_x4, .-sha256_mb_ce_x4
+	.section	.rol0_data.cst16,"aM",@progbits,16	// NOTE(review): name looks like a garbled .rodata.cst16 - confirm before touching
+	.align	4	// power-of-two alignment on AArch64 GAS, i.e. 16 bytes
+KEY:	// the 64 SHA-256 round constants K[0..63]; the code above walks them 16 bytes at a time through tmp
+	.word 0x428A2F98	// K[0]
+	.word 0x71374491
+	.word 0xB5C0FBCF
+	.word 0xE9B5DBA5
+	.word 0x3956C25B
+	.word 0x59F111F1
+	.word 0x923F82A4
+	.word 0xAB1C5ED5
+	.word 0xD807AA98
+	.word 0x12835B01
+	.word 0x243185BE
+	.word 0x550C7DC3
+	.word 0x72BE5D74
+	.word 0x80DEB1FE
+	.word 0x9BDC06A7
+	.word 0xC19BF174
+	.word 0xE49B69C1	// K[16]
+	.word 0xEFBE4786
+	.word 0x0FC19DC6
+	.word 0x240CA1CC
+	.word 0x2DE92C6F
+	.word 0x4A7484AA
+	.word 0x5CB0A9DC
+	.word 0x76F988DA
+	.word 0x983E5152
+	.word 0xA831C66D
+	.word 0xB00327C8
+	.word 0xBF597FC7
+	.word 0xC6E00BF3
+	.word 0xD5A79147
+	.word 0x06CA6351
+	.word 0x14292967
+	.word 0x27B70A85	// K[32]
+	.word 0x2E1B2138
+	.word 0x4D2C6DFC
+	.word 0x53380D13
+	.word 0x650A7354
+	.word 0x766A0ABB
+	.word 0x81C2C92E
+	.word 0x92722C85
+	.word 0xA2BFE8A1
+	.word 0xA81A664B
+	.word 0xC24B8B70
+	.word 0xC76C51A3
+	.word 0xD192E819
+	.word 0xD6990624
+	.word 0xF40E3585
+	.word 0x106AA070
+	.word 0x19A4C116	// K[48]
+	.word 0x1E376C08
+	.word 0x2748774C
+	.word 0x34B0BCB5
+	.word 0x391C0CB3
+	.word 0x4ED8AA4A
+	.word 0x5B9CCA4F
+	.word 0x682E6FF3
+	.word 0x748F82EE
+	.word 0x78A5636F
+	.word 0x84C87814
+	.word 0x8CC70208
+	.word 0x90BEFFFA
+	.word 0xA4506CEB
+	.word 0xBEF9A3F7
+	.word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
new file mode 100644
index 000000000..12441a8e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+// Initialize a SHA256_HASH_CTX_MGR by initializing its embedded multi-buffer manager (mgr->mgr).
+void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+	sha256_mb_mgr_init_avx(&mgr->mgr);
+}
+// Queue len bytes of buffer on ctx. Usage errors return ctx with ctx->error set; otherwise returns a COMPLETE/IDLE ctx or NULL.
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					   const void *buffer, uint32_t len,
+					   HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job; only a HASH_FIRST submission may reuse the ctx.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed; ctx becomes whatever the manager returns.
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+									   &ctx->job);
+		}
+	}
+	// Hash any whole blocks left in the user's buffer and, for HASH_LAST, the padded tail.
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Keep flushing the multi-buffer manager until some context can be handed
+	// back to the caller, or until the manager reports nothing left in flight.
+	SHA256_HASH_CTX *ready = NULL;
+
+	do {
+		// Ask the manager to finish one of its in-flight jobs.
+		SHA256_HASH_CTX *flushed =
+		    (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr);
+
+		// NULL from the manager: no jobs remain, so there is nothing to return.
+		if (flushed == NULL)
+			break;
+
+		// The flushed job may still have data or padding pending; resubmit
+		// feeds that back in and yields NULL when no context is ready yet.
+		ready = sha256_ctx_mgr_resubmit(mgr, flushed);
+	} while (ready == NULL);
+
+	return ready;
+}
+
+// Advance a context handed back by the multi-buffer manager until it can be returned.
+// Whole blocks left in the user's buffer are hashed first, then (for HASH_CTX_STS_LAST)
+// the padded tail. Returns a ctx whose status is COMPLETE or IDLE, or NULL.
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+										   &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+									   &ctx->job);
+			continue;
+		}
+
+		// No LAST request yet: go idle until the user submits more data (ctx is non-NULL here).
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Copy the SHA256_INITIAL_DIGEST words into the caller's digest array.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+// Append the padding (0x80 marker, cleared bytes, 64-bit bit-length field) in padblock; returns the number of blocks to hash.
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));	// total_len modulo the block size
+	// Clear SHA256_BLOCK_SIZE bytes starting at i (memclr_fixedlen is defined elsewhere), then set the 0x80 marker.
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+	// The addend rounds i + 1 + SHA256_PADLENGTHFIELD_SIZE up to a multiple of SHA256_BLOCK_SIZE (mask math assumes a power of two).
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+	// Length in bits goes into the final 8 bytes via to_be64(). NOTE(review): type-punned uint64_t store -- confirm alignment/aliasing is acceptable.
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+// Version tags: each *_slver_<core><ver><snum> symbol name spells, in hex, the values of the initialized struct that follows it.
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx_slver_02020154;
+struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 };	// snum, ver, core
+
+struct slver sha256_ctx_mgr_submit_avx_slver_02020155;
+struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 };	// snum, ver, core
+
+struct slver sha256_ctx_mgr_flush_avx_slver_02020156;
+struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 };	// snum, ver, core
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
new file mode 100644
index 000000000..9c045659e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+// Initialize a SHA256_HASH_CTX_MGR by initializing its embedded multi-buffer manager (mgr->mgr).
+void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+	sha256_mb_mgr_init_avx2(&mgr->mgr);
+}
+// Queue len bytes of buffer on ctx. Usage errors return ctx with ctx->error set; otherwise returns a COMPLETE/IDLE ctx or NULL.
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					    const void *buffer, uint32_t len,
+					    HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job; only a HASH_FIRST submission may reuse the ctx.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed; ctx becomes whatever the manager returns.
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+									    &ctx->job);
+		}
+	}
+	// Hash any whole blocks left in the user's buffer and, for HASH_LAST, the padded tail.
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Keep flushing the multi-buffer manager until some context can be handed
+	// back to the caller, or until the manager reports nothing left in flight.
+	SHA256_HASH_CTX *ready = NULL;
+
+	do {
+		// Ask the manager to finish one of its in-flight jobs.
+		SHA256_HASH_CTX *flushed =
+		    (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr);
+
+		// NULL from the manager: no jobs remain, so there is nothing to return.
+		if (flushed == NULL)
+			break;
+
+		// The flushed job may still have data or padding pending; resubmit
+		// feeds that back in and yields NULL when no context is ready yet.
+		ready = sha256_ctx_mgr_resubmit(mgr, flushed);
+	} while (ready == NULL);
+
+	return ready;
+}
+
+// Advance a context handed back by the multi-buffer manager until it can be returned.
+// Whole blocks left in the user's buffer are hashed first, then (for HASH_CTX_STS_LAST)
+// the padded tail. Returns a ctx whose status is COMPLETE or IDLE, or NULL.
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+										    &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+									    &ctx->job);
+			continue;
+		}
+
+		// No LAST request yet: go idle until the user submits more data (ctx is non-NULL here).
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Copy the SHA256_INITIAL_DIGEST words into the caller's digest array.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+// Append the padding (0x80 marker, cleared bytes, 64-bit bit-length field) in padblock; returns the number of blocks to hash.
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));	// total_len modulo the block size
+	// Clear SHA256_BLOCK_SIZE bytes starting at i (memclr_fixedlen is defined elsewhere), then set the 0x80 marker.
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+	// The addend rounds i + 1 + SHA256_PADLENGTHFIELD_SIZE up to a multiple of SHA256_BLOCK_SIZE (mask math assumes a power of two).
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+	// Length in bits goes into the final 8 bytes via to_be64(). NOTE(review): type-punned uint64_t store -- confirm alignment/aliasing is acceptable.
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+// Version tags: each *_slver_<core><ver><snum> symbol name spells, in hex, the values of the initialized struct that follows it.
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx2_slver_04020157;
+struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 };	// snum, ver, core
+
+struct slver sha256_ctx_mgr_submit_avx2_slver_04020158;
+struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 };	// snum, ver, core
+
+struct slver sha256_ctx_mgr_flush_avx2_slver_04020159;
+struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 };	// snum, ver, core
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
new file mode 100644
index 000000000..a1f068987
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
@@ -0,0 +1,273 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+// Initialize a SHA256_HASH_CTX_MGR by initializing its embedded multi-buffer manager (mgr->mgr).
+void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+	sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+// Queue len bytes of buffer on ctx. Usage errors return ctx with ctx->error set; otherwise returns a COMPLETE/IDLE ctx or NULL.
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					      const void *buffer, uint32_t len,
+					      HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job; only a HASH_FIRST submission may reuse the ctx.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed; ctx becomes whatever the manager returns.
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+									      &ctx->job);
+		}
+	}
+	// Hash any whole blocks left in the user's buffer and, for HASH_LAST, the padded tail.
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Keep flushing the multi-buffer manager until some context can be handed
+	// back to the caller, or until the manager reports nothing left in flight.
+	SHA256_HASH_CTX *ready = NULL;
+
+	do {
+		// Ask the manager to finish one of its in-flight jobs.
+		SHA256_HASH_CTX *flushed =
+		    (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr);
+
+		// NULL from the manager: no jobs remain, so there is nothing to return.
+		if (flushed == NULL)
+			break;
+
+		// The flushed job may still have data or padding pending; resubmit
+		// feeds that back in and yields NULL when no context is ready yet.
+		ready = sha256_ctx_mgr_resubmit(mgr, flushed);
+	} while (ready == NULL);
+
+	return ready;
+}
+
+// Advance a context handed back by the multi-buffer manager until it can be returned.
+// Whole blocks left in the user's buffer are hashed first, then (for HASH_CTX_STS_LAST)
+// the padded tail. Returns a ctx whose status is COMPLETE or IDLE, or NULL.
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx =
+				    (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+										    &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+									      &ctx->job);
+			continue;
+		}
+
+		// No LAST request yet: go idle until the user submits more data (ctx is non-NULL here).
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Copy the SHA256_INITIAL_DIGEST words into the caller's digest array.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+// Append the padding (0x80 marker, cleared bytes, 64-bit bit-length field) in padblock; returns the number of blocks to hash.
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));	// total_len modulo the block size
+	// Clear SHA256_BLOCK_SIZE bytes starting at i (memclr_fixedlen is defined elsewhere), then set the 0x80 marker.
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+	// The addend rounds i + 1 + SHA256_PADLENGTHFIELD_SIZE up to a multiple of SHA256_BLOCK_SIZE (mask math assumes a power of two).
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+	// Length in bits goes into the final 8 bytes via to_be64(). NOTE(review): type-punned uint64_t store -- confirm alignment/aliasing is acceptable.
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+// Version tags: each *_slver_<core><ver><snum> symbol name spells, in hex, the values of the initialized struct that follows it.
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_slver_0600015a;
+struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 };	// snum, ver, core
+
+struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b;
+struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 };	// snum, ver, core
+
+struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c;
+struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 };	// snum, ver, core
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
new file mode 100644
index 000000000..763057f12
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
@@ -0,0 +1,283 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ *  The sha256_ctx_avx512_ni functions are intended for Cannon Lake.
+ *  Since SHANI is still slower than multibuffer when all lanes are full,
+ *  sha256_ctx_mgr_init_avx512_ni and sha256_ctx_mgr_submit_avx512_ni are
+ *  similar to their avx512 versions.
+ *  sha256_ctx_mgr_flush_avx512_ni is different: it calls
+ *  sha256_mb_mgr_flush_avx512_ni, which uses SHANI when fewer lanes than
+ *  a threshold are in use.
+ *
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+	sha256_mb_mgr_init_avx512(&mgr->mgr);	// Initializes the embedded job manager with the plain AVX512 init routine
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512_ni(SHA256_HASH_CTX_MGR * mgr,
+						 SHA256_HASH_CTX * ctx, const void *buffer,
+						 uint32_t len, HASH_CTX_FLAG flags)
+{	// Returns ctx with ctx->error set on misuse, else the result of sha256_ctx_mgr_resubmit (may be NULL)
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;	// job.len is counted in blocks, not bytes
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+									      &ctx->job);	// plain AVX512 submit is used here
+		}
+	}
+
+	return sha256_ctx_mgr_resubmit(mgr, ctx);	// NULL when no context is ready to be handed back
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Drains the low-level manager until some context can be handed back to the
+	// user. Returns that context, or NULL once the manager has no job left in
+	// flight.
+	SHA256_HASH_CTX *ready = NULL;
+
+	while (ready == NULL) {
+		SHA256_HASH_CTX *flushed =
+		    (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+		// The manager reports NULL when it has nothing left to process.
+		if (flushed == NULL)
+			break;
+
+		// A flushed context may still owe data or padding; resubmit feeds that
+		// back in and yields NULL, in which case we flush again.
+		ready = sha256_ctx_mgr_resubmit(mgr, flushed);
+	}
+
+	return ready;
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx)
+{	// Advances ctx until it is COMPLETE or IDLE (then returned), or until the manager hands back NULL
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;	// Whole blocks go to the job below; the tail was copied above
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx =
+				    (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+										    &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =	// COMPLETE is set now so the check at the top of the loop returns ctx afterwards
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+									      &ctx->job);
+			continue;
+		}
+
+		if (ctx)	// Always true here: the loop condition already guarantees a non-NULL ctx
+			ctx->status = HASH_CTX_STS_IDLE;	// Not LAST: wait for more user input
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Overwrite the caller's digest words with the SHA256_INITIAL_DIGEST constants.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));	// total_len modulo the block size
+	// Writes the 0x80 marker, zero fill and the bit length into padblock, starting at offset i.
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;	// masked term = zero bytes needed to end on a block boundary
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;	// Upper half of a 16-byte length field is zero
+#endif
+	// The last 8 bytes hold the message length in bits (total_len << 3), converted with to_be64.
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+struct slver {	// Version tag defined once per public function; initializers are { snum, ver, core }
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_ni_slver_080002ca;	// Suffix spells core, ver, snum in hex
+struct slver sha256_ctx_mgr_init_avx512_ni_slver = { 0x02ca, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver_080002cb;
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver = { 0x02cb, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver_080002cc;
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver = { 0x02cc, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
new file mode 100644
index 000000000..58bf024a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
@@ -0,0 +1,301 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))	// rotate right; used below with 32-bit x and 0 < r < 32
+
+#define W(x) w[(x) & 15]	// message schedule kept as a 16-word ring, indexed modulo 16
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))	// message-schedule sigma0
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))	// message-schedule sigma1
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))	// round function Sigma0
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))	// round function Sigma1
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))	// bitwise majority of a, b, c
+#define ch(e,f,g) ((e & f) ^ (g & ~e))	// bitwise choose: f where e is set, g elsewhere
+// step(): one round i with constant k; expects w, ww, t1 and t2 to exist in the expanding scope
+#define step(i,a,b,c,d,e,f,g,h,k) \
+	if (i<16) W(i) = to_be32(ww[i]); \
+	else \
+	W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+	t2 = s0(a) + maj(a,b,c); \
+	t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+	d += t1; \
+	h = t1 + t2;
+
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha256_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+
+void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr)
+{	// Intentionally empty: the base functions in this file never read or write mgr
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					    const void *buffer, uint32_t len,
+					    HASH_CTX_FLAG flags)
+{
+	// Synchronous submit: all hashing for this call is done before ctx is returned.
+	// NOTE(review): for HASH_FIRST and HASH_UPDATE only whole blocks are hashed; the
+	// len % SHA256_BLOCK_SIZE trailing bytes are neither hashed nor buffered - confirm intent.
+	uint32_t tail_len;
+
+	// Reject any flag bit outside FIRST/UPDATE/LAST.
+	if ((flags & (~HASH_ENTIRE)) != 0) {
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+	// A context that is still processing cannot take a new entire job.
+	if (flags == HASH_ENTIRE && (ctx->status & HASH_CTX_STS_PROCESSING)) {
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+	// A finished context can only be reused by a submit that carries HASH_FIRST.
+	if (!(flags & HASH_FIRST) && (ctx->status & HASH_CTX_STS_COMPLETE)) {
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	switch (flags) {
+	case HASH_FIRST:
+		sha256_init(ctx, buffer, len);
+		sha256_update(ctx, buffer, len);
+		break;
+	case HASH_UPDATE:
+		sha256_update(ctx, buffer, len);
+		break;
+	case HASH_LAST:
+		tail_len = sha256_update(ctx, buffer, len);
+		sha256_final(ctx, tail_len);
+		break;
+	case HASH_ENTIRE:
+		sha256_init(ctx, buffer, len);
+		tail_len = sha256_update(ctx, buffer, len);
+		sha256_final(ctx, tail_len);
+		break;
+	default:
+		break;
+	}
+	return ctx;
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr)
+{
+	return NULL;	// Always NULL: sha256_ctx_mgr_submit_base hashes synchronously and queues nothing
+}
+
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+	// Resets ctx for a new message. buffer and len are accepted but not used here.
+	// Byte counter back to zero
+	ctx->total_length = 0;
+
+	// Nothing is held in the partial block buffer yet
+	ctx->partial_block_buffer_length = 0;
+
+	// Starting over wipes any error recorded by an earlier submit,
+	// and the context counts as processing from here on
+	ctx->error = HASH_CTX_ERROR_NONE;
+	ctx->status = HASH_CTX_STS_PROCESSING;
+
+	// Seed the running digest last; it does not depend on the fields above
+	hash_init_digest(ctx->job.result_digest);
+}
+
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+	// Hashes each whole block of buffer; returns the unhashed tail size (tail starts at ctx->incoming_buffer).
+	const uint8_t *cursor = (const uint8_t *) buffer;
+	uint32_t left;
+
+	for (left = len; left >= SHA256_BLOCK_SIZE; left -= SHA256_BLOCK_SIZE) {
+		sha256_single(cursor, ctx->job.result_digest);
+		cursor += SHA256_BLOCK_SIZE;
+		ctx->total_length += SHA256_BLOCK_SIZE;
+	}
+	ctx->incoming_buffer = cursor;
+	ctx->status = HASH_CTX_STS_IDLE;
+	return left;
+}
+
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len)
+{	// Pads the remain_len tail bytes at ctx->incoming_buffer, hashes them, marks ctx COMPLETE
+	const void *buffer = ctx->incoming_buffer;
+	uint32_t i = remain_len, j;
+	uint8_t buf[2 * SHA256_BLOCK_SIZE];
+	uint32_t *digest = ctx->job.result_digest;
+	uint64_t be_bits;
+
+	ctx->total_length += i;
+	memcpy(buf, buffer, i);
+	buf[i++] = 0x80;	// Padding marker directly after the message bytes
+	for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+		buf[j] = 0;
+
+	i = (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE) ?
+	    2 * SHA256_BLOCK_SIZE : SHA256_BLOCK_SIZE;	// 2nd block only if the length field does not fit
+
+	// Store the bit length with memcpy: buf is a byte array with no uint64_t alignment guarantee
+	be_bits = to_be64((uint64_t) ctx->total_length * 8);
+	memcpy(buf + i - 8, &be_bits, sizeof(be_bits));
+
+	sha256_single(buf, digest);
+	if (i == 2 * SHA256_BLOCK_SIZE)
+		sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+
+	ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha256_single(const void *data, uint32_t digest[])
+{	// Compresses the 16 words (64 bytes) at data into digest[0..7], which is updated in place
+	uint32_t a, b, c, d, e, f, g, h, t1, t2;
+	uint32_t w[16];
+	const uint32_t *ww = (const uint32_t *) data;	// read-only view: data's const is kept
+	// NOTE(review): data is read through a uint32_t pointer, so it is assumed suitably aligned
+	a = digest[0];
+	b = digest[1];
+	c = digest[2];
+	d = digest[3];
+	e = digest[4];
+	f = digest[5];
+	g = digest[6];
+	h = digest[7];
+	// 64 rounds; the argument order rotates each step instead of moving the variables
+	step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+	step(1, h, a, b, c, d, e, f, g, 0x71374491);
+	step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+	step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+	step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+	step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+	step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+	step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+	step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+	step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+	step(10, g, h, a, b, c, d, e, f, 0x243185be);
+	step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+	step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+	step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+	step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+	step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+	step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+	step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+	step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+	step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+	step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+	step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+	step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+	step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+	step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+	step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+	step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+	step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+	step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+	step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+	step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+	step(31, b, c, d, e, f, g, h, a, 0x14292967);
+	step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+	step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+	step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+	step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+	step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+	step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+	step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+	step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+	step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+	step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+	step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+	step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+	step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+	step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+	step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+	step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+	step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+	step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+	step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+	step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+	step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+	step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+	step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+	step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+	step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+	step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+	step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+	step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+	step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+	step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+	step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+	step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+	// Add the working variables back into the caller's digest
+	digest[0] += a;
+	digest[1] += b;
+	digest[2] += c;
+	digest[3] += d;
+	digest[4] += e;
+	digest[5] += f;
+	digest[6] += g;
+	digest[7] += h;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Overwrite the caller's digest words with the SHA256_INITIAL_DIGEST constants.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+
+struct slver {	// Version tag defined once per public function; initializers are { snum, ver, core }
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_base_slver_000002f0;	// Suffix spells core, ver, snum in hex
+struct slver sha256_ctx_mgr_init_base_slver = { 0x02f0, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_base_slver_000002f1;
+struct slver sha256_ctx_mgr_submit_base_slver = { 0x02f1, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_base_slver_000002f2;
+struct slver sha256_ctx_mgr_flush_base_slver = { 0x02f2, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
new file mode 100644
index 000000000..1483f631c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr,
+						   SHA256_HASH_CTX * ctx, const void *buffer,
+						   uint32_t len, HASH_CTX_FLAG flags);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr);
+
+void sha256_ctx_mgr_init(SHA256_HASH_CTX_MGR * mgr)
+{	// Forwards to the base implementation
+	sha256_ctx_mgr_init_base(mgr);	// No `return`: ISO C forbids returning an expression from a void function
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+				       const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	return sha256_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);	// Plain forward to the base version
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush(SHA256_HASH_CTX_MGR * mgr)
+{
+	return sha256_ctx_mgr_flush_base(mgr);	// Plain forward to the base version
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
new file mode 100644
index 000000000..f85f5c88b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+	sha256_mb_mgr_init_sse(&mgr->mgr);	// All setup is delegated to the SSE init of the embedded job manager
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					   const void *buffer, uint32_t len,
+					   HASH_CTX_FLAG flags)
+{
+	// Returns ctx with ctx->error set on misuse, else the result of sha256_ctx_mgr_resubmit (may be NULL).
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;	// job.len is counted in blocks, not bytes
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+									   &ctx->job);
+		}
+	}
+
+	return sha256_ctx_mgr_resubmit(mgr, ctx);	// NULL when no context is ready to be handed back
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Drains the low-level manager until some context can be handed back to the
+	// user. Returns that context, or NULL once the manager has no job left in
+	// flight.
+	SHA256_HASH_CTX *ready = NULL;
+
+	while (ready == NULL) {
+		SHA256_HASH_CTX *flushed =
+		    (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr);
+
+		// The manager reports NULL when it has nothing left to process.
+		if (flushed == NULL)
+			break;
+
+		// A flushed context may still owe data or padding; resubmit feeds that
+		// back in and yields NULL, in which case we flush again.
+		ready = sha256_ctx_mgr_resubmit(mgr, flushed);
+	}
+
+	return ready;
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx)
+{	// Advances ctx until it is COMPLETE or IDLE (then returned), or until the manager hands back NULL
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;	// Whole blocks go to the job below; the tail was copied above
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+										   &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =	// COMPLETE is set now so the check at the top of the loop returns ctx afterwards
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+									   &ctx->job);
+			continue;
+		}
+
+		if (ctx)	// Always true here: the loop condition already guarantees a non-NULL ctx
+			ctx->status = HASH_CTX_STS_IDLE;	// Not LAST: wait for more user input
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	// Overwrite the caller's digest words with the SHA256_INITIAL_DIGEST constants.
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));	// total_len modulo the block size
+	// Writes the 0x80 marker, zero fill and the bit length into padblock, starting at offset i.
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;	// masked term = zero bytes needed to end on a block boundary
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;	// Upper half of a 16-byte length field is zero
+#endif
+	// The last 8 bytes hold the message length in bits (total_len << 3), converted with to_be64.
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+struct slver {	// Version tag defined once per public function; initializers are { snum, ver, core }
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_slver_00020151;	// Suffix spells core, ver, snum in hex
+struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_sse_slver_00020152;
+struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_sse_slver_00020153;
+struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
new file mode 100644
index 000000000..e2c7e2738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
@@ -0,0 +1,262 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+						SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+	// The sse_ni variant has no init of its own: it calls the plain SSE job-manager init
+	sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+/*
+ * Submit len bytes at buffer to be hashed into ctx by the sse_ni job manager.
+ * flags may combine HASH_FIRST and HASH_LAST; any bit outside HASH_ENTIRE is an error.
+ * Input shorter than a block (or following buffered data) first fills the partial block.
+ * Returns ctx itself, with ctx->error set, when the request is rejected; otherwise
+ * returns the result of sha256_ctx_mgr_resubmit(): a context that is complete or
+ * idle, or NULL when the job manager handed back no finished job.
+ */
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse_ni(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+					      const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	uint32_t buffered, room, take;
+
+	if (flags & (~HASH_ENTIRE)) {
+		// Only FIRST, UPDATE and LAST are legal flag values
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// This context is still being hashed
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// A finished context only accepts a new HASH_FIRST submission
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// New message: initial digest, zero byte count, empty partial block
+		hash_init_digest(ctx->job.result_digest);
+		ctx->total_length = 0;
+		ctx->partial_block_buffer_length = 0;
+	}
+
+	// Every check passed, so this call reports no error
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Record the caller's buffer and add its bytes to the running total
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+	ctx->total_length += len;
+
+	// Mark the context busy and remember whether this is the final chunk
+	if (flags & HASH_LAST)
+		ctx->status = (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST);
+	else
+		ctx->status = HASH_CTX_STS_PROCESSING;
+
+	// Top up the partial block when it already holds data, or when the caller
+	// supplied less than one whole block.
+	buffered = ctx->partial_block_buffer_length;
+	if (buffered || len < SHA256_BLOCK_SIZE) {
+		// Take as much as fits in the partial block, but no more than was supplied
+		room = SHA256_BLOCK_SIZE - buffered;
+		take = (len < room) ? len : room;
+
+		if (take) {
+			memcpy_varlen(&ctx->partial_block_buffer[buffered], buffer, take);
+
+			ctx->partial_block_buffer_length = buffered + take;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + take);
+			ctx->incoming_buffer_length = len - take;
+		}
+		// The partial block never holds more than one block here
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			// A full block is buffered: hand it over as a one-block job
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+									      &ctx->job);
+		}
+	}
+
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Flush the sse_ni job manager until a context can be handed back to the caller.
+ * Returns NULL once the manager's flush reports that no jobs are left in flight.
+ */
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+	SHA256_HASH_CTX *ctx;
+
+	do {
+		ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+		// Nothing came back from the flush: the manager is empty.
+		if (ctx == NULL)
+			break;
+
+		// A flushed job may still owe work (remaining input, padding); resubmit
+		// decides that and yields NULL when nothing is ready to be returned yet.
+		ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+	} while (ctx == NULL);
+
+	return ctx;
+}
+
+// Drive ctx (and any job the manager hands back) until one is complete or idle; NULL if none is.
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx =
+				    (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+										    &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+
+			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+									      &ctx->job);
+			continue;
+		}
+
+		// Not the last chunk and nothing left to submit: wait idle for more input (ctx is non-NULL here)
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+// Seed digest with the SHA256_INITIAL_DIGEST words.
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+	static const SHA256_WORD_T iv[SHA256_DIGEST_NWORDS] = { SHA256_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, iv, sizeof(iv));
+}
+
+// Build the SHA-256 padding after the buffered tail; returns how many blocks to hash.
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t k, i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+	uint64_t bit_len = (uint64_t) total_len << 3;	// message length in bits
+
+	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;	// terminator byte directly after the message tail
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+	    1 + SHA256_PADLENGTHFIELD_SIZE;
+
+	// Store the big-endian length bytewise (no type-punned uint64_t store); upper bytes become 0
+	for (k = 1; k <= SHA256_PADLENGTHFIELD_SIZE; k++, bit_len >>= 8)
+		padblock[i - k] = (uint8_t) bit_len;
+
+	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+struct slver {	// version record; one instance is defined per exported entry point below
+	uint16_t snum;	// matches the last 4 hex digits of the _slver_XXXXXXXX symbol suffix
+	uint8_t ver;	// matches the middle 2 hex digits of that suffix
+	uint8_t core;	// matches the first 2 hex digits of that suffix
+};
+struct slver sha256_ctx_mgr_init_sse_ni_slver_070002c7;
+struct slver sha256_ctx_mgr_init_sse_ni_slver = { 0x02c7, 0x00, 0x07 };	// { snum, ver, core }
+
+struct slver sha256_ctx_mgr_submit_sse_ni_slver_070002c8;
+struct slver sha256_ctx_mgr_submit_sse_ni_slver = { 0x02c8, 0x00, 0x07 };	// { snum, ver, core }
+
+struct slver sha256_ctx_mgr_flush_sse_ni_slver_070002c9;
+struct slver sha256_ctx_mgr_flush_sse_ni_slver = { 0x02c9, 0x00, 0x07 };	// { snum, ver, core }
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
new file mode 100644
index 000000000..f9fb6d230
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN		0
+%define STS_BEING_PROCESSED	1
+%define STS_COMPLETED		2 ; stored into a job's _status field when its lane finishes (see sha256_mb_mgr_flush_avx)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= the threshold, the single-buffer (sb) function is called instead of the multi-buffer kernel
+%define SHA256_SB_THRESHOLD_SSE		1
+%define SHA256_SB_THRESHOLD_AVX		1
+%define SHA256_SB_THRESHOLD_AVX2	1
+%define SHA256_SB_THRESHOLD_AVX512	1
+%define SHA256_NI_SB_THRESHOLD_SSE	4 ; shani is faster than sse sha256_mb
+%define SHA256_NI_SB_THRESHOLD_AVX512	6
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS	; SHA256_JOB
+; Field offsets of the job object; the C ctx code accesses the same fields as job.buffer, job.len, job.result_digest
+;;;	name				size	align
+FIELD	_buffer,			8,	8	; pointer to buffer
+FIELD	_len,				8,	8	; length in blocks: the C ctx code stores block counts here (job.len = 1 for one block)
+FIELD	_result_digest,			8*4,	64	; Digest (output): 8 dwords, 64-byte aligned
+FIELD	_status,			4,	4
+FIELD	_user_data,			8,	8
+; NOTE(review): no END_FIELDS before the %assign lines, unlike the structs in sha256_mb_mgr_datastruct.asm - confirm this is intended
+%assign _SHA256_JOB_size	_FIELD_OFFSET
+%assign _SHA256_JOB_align	_STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
new file mode 100644
index 000000000..28f1f5118
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS (SHA256_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Fill buf[0..buffer_size) with bytes drawn from rand(), lowest address first
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long remaining = buffer_size;
+	while (remaining-- > 0)
+		*buf++ = rand();
+}
+
+// Print every lane's length scaled to bytes; returns how many non-zero lanes kept their value.
+uint8_t lens_print_and_check(SHA256_HASH_CTX_MGR * mgr)
+{
+	static int32_t prev_bytes[SHA256_MAX_LANES] = { 0 };
+	uint8_t stalled = 0;
+	int lane;
+	for (lane = 0; lane < SHA256_MAX_LANES; lane++) {
+		// lens[] entry: length << 4 | lane_index; drop the index, scale by 64 to bytes
+		int32_t raw = (int32_t) mgr->mgr.lens[lane];
+		int32_t bytes = (raw < 16) ? 0 : (raw >> 4 << 6);
+		printf("\t%d", bytes);
+		if (prev_bytes[lane] > 0 && prev_bytes[lane] == bytes)
+			stalled += 1;
+		prev_bytes[lane] = bytes;
+	}
+	printf("\n");
+	return stalled;
+}
+
+// Flush test: hash TEST_BUFS buffers via submit + flush, compare with sha256_ref(); returns the mismatch count.
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	uint8_t num_ret, num_unchanged = 0;
+	int ret;
+
+	printf("sha256_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+	ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha256_ctx_mgr_init(mgr);
+	srand(TEST_SEED);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		lens[i] = TEST_LEN / SHA256_MAX_LANES * (i + 1);
+		bufs[i] = (unsigned char *)malloc(lens[i]);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			while (i--)	// release the buffers allocated so far
+				free(bufs[i]);
+			return 1;
+		}
+		rand_buffer(bufs[i], lens[i]);
+	}
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Init ctx contexts
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+		// Run sb_sha256 test
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+	}
+
+	printf("Changes of lens inside mgr:\n");
+	lens_print_and_check(mgr);
+	while (sha256_ctx_mgr_flush(mgr)) {
+		num_ret = lens_print_and_check(mgr);
+		num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+	}
+	printf("Info of sha256_mb lens prints over\n");
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%u fixed size, digest%u fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j], digest_ref[i][j]);
+			}
+		}
+		free(bufs[i]);	// every job has been flushed and this digest is checked
+	}
+
+	if (fail)
+		printf("Test failed function check %u\n", fail);
+	else if (num_unchanged)
+		printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+		       num_unchanged, num_unchanged + 1);
+	else
+		printf("SHA-NI is not used, or used for last job\n");
+	// NOTE(review): mgr is still left to process exit; release it with the deallocator matching posix_memalign
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..ebba9ca36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS    ; LANE_DATA
+;;;     name            size    align
+FIELD   _job_in_lane,   8,      8       ; pointer to job object; 0 marks an empty lane
+END_FIELDS
+; size/alignment of one per-lane record, captured from the FIELD bookkeeping
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align        _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS    ; SHA256_ARGS_X16
+;;;     name            size    align
+FIELD   _digest,        4*8*16,  4       ; transposed digest (room for 8 dwords x 16 lanes)
+FIELD   _data_ptr,      8*16,    8       ; array of pointers to data (16 qword slots)
+END_FIELDS
+; the X4, X8 and X16 names below all take their size/alignment from this single 16-lane layout
+%assign _SHA256_ARGS_X4_size    _FIELD_OFFSET
+%assign _SHA256_ARGS_X4_align   _STRUCT_ALIGN
+%assign _SHA256_ARGS_X8_size	_FIELD_OFFSET
+%assign _SHA256_ARGS_X8_align	_STRUCT_ALIGN
+%assign _SHA256_ARGS_X16_size	_FIELD_OFFSET
+%assign _SHA256_ARGS_X16_align	_STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS    ; MB_MGR
+;;;     name            size    align
+FIELD   _args,          _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align ; digest + data pointers (same size as the X16 layout)
+FIELD   _lens,          4*16,    8 ; 16 dwords, each (length << 4) | lane index
+FIELD   _unused_lanes,  8,      8 ; packed free-lane indices, 4 bits per entry
+FIELD   _ldata,         _LANE_DATA_size*16, _LANE_DATA_align ; 16 LANE_DATA records
+FIELD   _num_lanes_inuse, 4,    4 ; lanes currently holding a job; 0 means nothing to flush
+END_FIELDS
+
+%assign _MB_MGR_size    _FIELD_OFFSET
+%assign _MB_MGR_align   _STRUCT_ALIGN
+; offsets of the args sub-fields measured from the MB_MGR base
+_args_digest    equ     _args + _digest
+_args_data_ptr  equ     _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..69f27f42d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1    rdi ; rcx on win64
+%define arg2    rsi ; rdx on win64
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx     rdx ; rsi on win64
+%else
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx     rsi
+%endif
+
+; Common definitions
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+; the next three names all alias rbx, so only one of them can be live at a time
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+; the next five names all alias rax
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+; tmp3 is the same register as state
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+; tmp4 and lens0 share r8
+%define tmp4            r8
+%define lens0           r8
+
+%define lens1           r9
+%define lens2           r10
+%define lens3           r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 mod 16 on entry, so this leaves it 16-byte aligned
+_XMM_SAVE_SIZE  equ 10*16 ; xmm6-xmm15, saved on win64 only
+_GPR_SAVE_SIZE  equ 8*3 ; rbx, r12, and rsi on win64
+_ALIGN_SIZE     equ 0
+
+_XMM_SAVE       equ 0
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+; APPEND pastes two tokens together; it builds the skip_N labels in the lane loop
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state; returns the finished job in rax, or NULL if no lane is in use
+mk_global sha256_mb_mgr_flush_avx, function
+sha256_mb_mgr_flush_avx:
+	endbranch
+
+	sub     rsp, STACK_SPACE	; room for the GPR saves and, on win64, xmm6-xmm15
+	mov     [rsp + _GPR_SAVE + 8*0], rbx
+	mov     [rsp + _GPR_SAVE + 8*1], r12	; NOTE(review): r12 is never named here; presumably clobbered by the called kernels
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*2], rsi	; rsi is idx on win64 and callee-saved there
+	vmovdqa  [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to judge all lanes are empty
+	cmp	dword [state + _num_lanes_inuse], 0
+	jz	return_null
+
+	; find a lane with a non-null job
+	xor     idx, idx	; lane 0 unless a higher lane holds a job
+	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [one]	; cmov has no immediate form, hence the qword constants at the end of the file
+	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [two]
+	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [three]
+
+	; copy idx's data pointer to empty lanes and give them the maximum length
+copy_lane_data:
+	mov     tmp, [state + _args + _data_ptr + 8*idx]	; data pointer of the lane chosen above
+
+%assign I 0
+%rep 4
+	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne     APPEND(skip_,I)
+	mov     [state + _args + _data_ptr + 8*I], tmp
+	mov     dword [state + _lens + 4*I], 0xFFFFFFFF	; largest value, so an empty lane never wins the min search
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	mov     DWORD(lens0), [state + _lens + 0*4]	; 32-bit load zero-extends into the 64-bit register
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1	; unsigned minimum of the (length << 4) | lane values
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx
+	and     idx, 0xF	; idx = lane index of the minimum entry
+	and     len2, ~0xF	; len2 = its length field, still shifted left by 4
+	jz      len_is_0	; that lane has nothing left to hash
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp	dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX
+	ja	mb_processing
+
+	; lensN-len2=idx
+	shr     len2, 4	; plain length, lane bits dropped
+	mov     [state + _lens + idx*4], DWORD(idx)	; lens[idx] = idx: length field cleared, lane index kept
+	mov	r10, idx
+	or	r10, 0x1000	; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_opt_x1
+	; state and idx are intact
+	jmp	len_is_0
+
+mb_processing:
+	sub     lens0, len2	; len2 has zero low bits, so the lane index in each entry is preserved
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4	; plain length, lane bits dropped
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call    sha256_mb_x4_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[idx]
+
+	mov     job_rax, [lane_data + _job_in_lane]
+	mov     qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov     dword [job_rax + _status], STS_COMPLETED
+	mov     unused_lanes, [state + _unused_lanes]	; rbx is reused: lane_data is dead from here
+	shl     unused_lanes, 4
+	or      unused_lanes, idx	; append idx to the packed 4-bit free-lane list
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub     dword [state + _num_lanes_inuse], 1
+
+	vmovd    xmm0, [state + _args_digest + 4*idx + 0*16]	; gather lane idx's 8 digest dwords; successive words are 16 bytes apart
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	vmovd    xmm1, [state + _args_digest + 4*idx + 4*16]
+	vpinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	vpinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	vpinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	vmovdqa  [job_rax + _result_digest + 0*16], xmm0	; vmovdqa needs job + _result_digest to be 16-byte aligned
+	vmovdqa  [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6,  [rsp + _XMM_SAVE + 16*0]
+	vmovdqa  xmm7,  [rsp + _XMM_SAVE + 16*1]
+	vmovdqa  xmm8,  [rsp + _XMM_SAVE + 16*2]
+	vmovdqa  xmm9,  [rsp + _XMM_SAVE + 16*3]
+	vmovdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	mov     r12, [rsp + _GPR_SAVE + 8*1]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; NULL result: no lane was in use
+	jmp     return
+
+section .data align=16
+
+align 16
+one:    dq  1	; qword lane-index constants, used as the memory operands of cmovne idx, [..]
+two:    dq  2
+three:  dq  3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..0ee0589cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
@@ -0,0 +1,274 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+%define tmp4    rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+%define tmp4    rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+; idx must be a register not clobbered by sha256_mb_x8_avx2 and sha256_opt_x1
+%define idx             rbp
+
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+
+; STACK_SPACE must be an odd multiple of 8: rsp is 8 mod 16 at entry, so the sub leaves it 16-byte aligned for the vmovdqa saves
+_XMM_SAVE_SIZE  equ 10*16	; room for xmm6-xmm15 (only stored on win64)
+_GPR_SAVE_SIZE  equ 8*8	; eight qword slots: rbx, rsi, rdi, rbp, r12-r15
+_ALIGN_SIZE     equ 8	; padding: 160 + 64 + 8 = 232 = 29 * 8
+
+_XMM_SAVE       equ 0	; XMM save area sits at the bottom of the frame
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE	; GPR slots follow the XMM area
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE	; total frame size subtracted from rsp
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state)
+; arg 1 : state (rdi on elf64, rcx otherwise); returns the flushed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_avx2, function
+sha256_mb_mgr_flush_avx2:
+	endbranch
+	sub     rsp, STACK_SPACE	; reserve the XMM/GPR save area
+	mov     [rsp + _GPR_SAVE + 8*0], rbx	; rbx backs unused_lanes / lane_data / tmp2
+	mov     [rsp + _GPR_SAVE + 8*3], rbp	; rbp backs idx
+	mov     [rsp + _GPR_SAVE + 8*4], r12	; r12-r15 are never written here; NOTE(review): presumably saved on behalf of the callees - confirm
+	mov     [rsp + _GPR_SAVE + 8*5], r13
+	mov     [rsp + _GPR_SAVE + 8*6], r14
+	mov     [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*1], rsi	; win64 only: rsi, rdi and xmm6-xmm15 are callee-saved there
+	mov     [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa  [rsp + _XMM_SAVE + 16*0], xmm6	; aligned stores: rely on rsp being 16-byte aligned after the sub
+	vmovdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; return NULL early when num_lanes_inuse says every lane is empty
+	cmp	dword [state + _num_lanes_inuse], 0
+	jz	return_null
+
+	; find a lane with a non-null job
+	xor	idx, idx	; default: lane 0
+	cmp	qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [one]	; each later hit overwrites idx, so the highest occupied lane wins
+	cmp	qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [two]
+	cmp	qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [three]
+	cmp	qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [four]
+	cmp	qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [five]
+	cmp	qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [six]
+	cmp	qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [seven]
+
+	; give every empty lane the data pointer of lane idx and the maximum length value
+copy_lane_data:
+	mov	tmp, [state + _args + _data_ptr + 8*idx]	; tmp = data pointer of the occupied lane
+
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0	; lane I holds a job?
+	jne	APPEND(skip_,I)
+	mov	[state + _args + _data_ptr + 8*I], tmp	; empty lane: borrow the occupied lane's pointer
+	mov	dword [state + _lens + 4*I], 0xFFFFFFFF	; largest unsigned dword, so it loses the unsigned min below
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqa xmm0, [state + _lens + 0*16]	; lens of lanes 0-3
+	vmovdqa xmm1, [state + _lens + 1*16]	; lens of lanes 4-7
+
+	vpminud xmm2, xmm0, xmm1        ; xmm2 has {D,C,B,A}
+	vpalignr xmm3, xmm3, xmm2, 8    ; xmm3 has {x,x,D,C}
+	vpminud xmm2, xmm2, xmm3        ; xmm2 has {x,x,E,F}
+	vpalignr xmm3, xmm3, xmm2, 4    ; xmm3 has {x,x,x,E}
+	vpminud xmm2, xmm2, xmm3        ; xmm2 has min value in low dword
+
+	vmovd   DWORD(idx), xmm2	; idx = smallest lens word
+	mov	len2, idx
+	and	idx, 0xF	; low 4 bits of the word: lane index
+	shr	len2, 4	; remaining bits: length passed to the hash routine
+	jz	len_is_0	; length 0: lane idx needs no further hashing
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp	dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2
+	ja	mb_processing	; unsigned compare: above threshold -> multi-buffer path
+
+	; single-buffer path: lens[idx] is reduced to just the lane index (length bits become 0)
+	mov     [state + _lens + idx*4], DWORD(idx)
+	mov	r10, idx
+	or	r10, 0x2000	; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_opt_x1
+	; callee is expected to leave state and idx intact (len_is_0 relies on both)
+	jmp	len_is_0
+
+mb_processing:
+
+	vpand   xmm2, xmm2, [rel clear_low_nibble]	; drop the lane-index nibble, keep only dword 0
+	vpshufd xmm2, xmm2, 0	; broadcast dword 0 to all four dwords
+
+	vpsubd  xmm0, xmm0, xmm2	; subtract the minimum length from every lane's lens word
+	vpsubd  xmm1, xmm1, xmm2
+
+	vmovdqa [state + _lens + 0*16], xmm0	; write the reduced lens back
+	vmovdqa [state + _lens + 1*16], xmm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha256_mb_x8_avx2
+	; callee is expected to leave state and idx intact (len_is_0 relies on both)
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size	; byte offset of lane idx inside _ldata
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[idx]
+
+	mov	job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov	qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov	dword [job_rax + _status], STS_COMPLETED
+	mov	unused_lanes, [state + _unused_lanes]	; rbx is reused here; lane_data is dead from this point
+	shl	unused_lanes, 4	; make room for one 4-bit entry
+	or	unused_lanes, idx	; record idx in the new low nibble
+	mov	[state + _unused_lanes], unused_lanes
+
+	sub     dword [state + _num_lanes_inuse], 1	; one fewer lane in use
+
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*4*8]	; digest words of lane idx are 4*8 bytes apart
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*4*8], 3	; xmm0 = digest words 0-3
+	vmovd	xmm1, [state + _args_digest + 4*idx + 4*4*8]
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 7*4*8], 3	; xmm1 = digest words 4-7
+
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0	; aligned stores: _result_digest must be 16-byte aligned
+	vmovdqa	[job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + _XMM_SAVE + 16*0]	; mirror of the prologue saves
+	vmovdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*1]
+	mov     rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	mov     rbp, [rsp + _GPR_SAVE + 8*3]
+	mov     r12, [rsp + _GPR_SAVE + 8*4]
+	mov     r13, [rsp + _GPR_SAVE + 8*5]
+	mov     r14, [rsp + _GPR_SAVE + 8*6]
+	mov     r15, [rsp + _GPR_SAVE + 8*7]
+	add     rsp, STACK_SPACE	; release the frame
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; rax = NULL: no job to return
+	jmp	return	; share the common epilogue
+
+section .data align=16
+
+align 16
+clear_low_nibble:	; AND mask: keeps dword 0 with its low 4 bits cleared, zeroes the other three dwords
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+one:	dq  1	; qword lane numbers read by the cmovne chain (cmov has no immediate form)
+two:	dq  2
+three:	dq  3
+four:	dq  4
+five:	dq  5
+six:	dq  6
+seven:	dq  7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..201cd42b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
@@ -0,0 +1,288 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+%define tmp4    rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+%define tmp4    rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_opt_x1
+%define idx             rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+
+; STACK_SPACE must be an odd multiple of 8: rsp is 8 mod 16 at entry, so the sub leaves it 16-byte aligned for the vmovdqa saves
+_XMM_SAVE_SIZE  equ 10*16	; room for xmm6-xmm15 (only stored on win64)
+_GPR_SAVE_SIZE  equ 8*8	; eight qword slots: rbx, rsi, rdi, rbp, r12-r15
+_ALIGN_SIZE     equ 8	; padding: 160 + 64 + 8 = 232 = 29 * 8
+
+_XMM_SAVE       equ 0	; XMM save area sits at the bottom of the frame
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE	; GPR slots follow the XMM area
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE	; total frame size subtracted from rsp
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state)
+; arg 1 : state (rdi on elf64, rcx otherwise); returns the flushed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_avx512, function
+sha256_mb_mgr_flush_avx512:
+	endbranch
+	sub     rsp, STACK_SPACE	; reserve the XMM/GPR save area
+	mov     [rsp + _GPR_SAVE + 8*0], rbx	; rbx backs unused_lanes / lane_data / tmp2
+	mov     [rsp + _GPR_SAVE + 8*3], rbp	; rbp backs idx
+	mov     [rsp + _GPR_SAVE + 8*4], r12	; r12-r15 are never written here; NOTE(review): presumably saved on behalf of the callees - confirm
+	mov     [rsp + _GPR_SAVE + 8*5], r13
+	mov     [rsp + _GPR_SAVE + 8*6], r14
+	mov     [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*1], rsi	; win64 only: rsi, rdi and xmm6-xmm15 are callee-saved there
+	mov     [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa  [rsp + _XMM_SAVE + 16*0], xmm6	; aligned stores: rely on rsp being 16-byte aligned after the sub
+	vmovdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]	; 32-bit load zero-extends into r9
+	cmp	num_lanes_inuse, 0
+	jz	return_null	; every lane is empty: return NULL
+
+	; find a lane with a non-null job
+	xor	idx, idx	; default: lane 0
+%assign I 1
+%rep 15
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [APPEND(lane_,I)]	; each later hit overwrites idx, so the highest occupied lane wins
+%assign I (I+1)
+%endrep
+
+
+	; give every empty lane the data pointer of lane idx and the maximum length value
+copy_lane_data:
+	mov	tmp, [state + _args + _data_ptr + 8*idx]	; tmp = data pointer of the occupied lane
+
+%assign I 0
+%rep 16
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0	; lane I holds a job?
+	jne	APPEND(skip_,I)
+	mov	[state + _args + _data_ptr + 8*I], tmp	; empty lane: borrow the occupied lane's pointer
+	mov	dword [state + _lens + 4*I], 0xFFFFFFFF	; largest unsigned dword, so it loses the unsigned min below
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqu ymm0, [state + _lens + 0*32]	; lens of lanes 0-7
+	vmovdqu ymm1, [state + _lens + 1*32]	; lens of lanes 8-15
+
+	vpminud ymm2, ymm0, ymm1        ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8    ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4    ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,x, x,C3,x,x, x,G3} (128-bit halves swapped)
+        vpminud ymm2, ymm2, ymm3        ; ymm2 has min value in low dword
+
+	vmovd   DWORD(idx), xmm2	; idx = smallest lens word
+	mov	len2, idx
+	and	idx, 0xF	; low 4 bits of the word: lane index
+	shr	len2, 4	; remaining bits: length passed to the hash routine
+	jz	len_is_0	; length 0: lane idx needs no further hashing
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp	dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX512
+	ja	mb_processing	; unsigned compare: above threshold -> multi-buffer path
+
+	; single-buffer path: lens[idx] is reduced to just the lane index (length bits become 0)
+	mov     [state + _lens + idx*4], DWORD(idx)
+	mov	r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_opt_x1
+	; callee is expected to leave state and idx intact (len_is_0 relies on both)
+	jmp	len_is_0
+
+mb_processing:
+
+	vpand   ymm2, ymm2, [rel clear_low_nibble]	; drop the lane-index nibble; keeps dword 0 of each 128-bit half
+        vpshufd ymm2, ymm2, 0	; broadcast dword 0 within each half (both halves hold the minimum after the last vpminud)
+
+        vpsubd  ymm0, ymm0, ymm2	; subtract the minimum length from every lane's lens word
+        vpsubd  ymm1, ymm1, ymm2
+
+        vmovdqu [state + _lens + 0*32], ymm0	; write the reduced lens back
+        vmovdqu [state + _lens + 1*32], ymm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha256_mb_x16_avx512
+	; callee is expected to leave state and idx intact (len_is_0 relies on both)
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size	; byte offset of lane idx inside _ldata
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[idx]
+
+	mov	job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov	qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov	dword [job_rax + _status], STS_COMPLETED
+	mov	unused_lanes, [state + _unused_lanes]	; rbx is reused here; lane_data is dead from this point
+	shl	unused_lanes, 4	; make room for one 4-bit entry
+	or	unused_lanes, idx	; record idx in the new low nibble
+	mov	[state + _unused_lanes], unused_lanes
+
+        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]	; reloaded: r9 is not relied on across the calls above
+        sub     num_lanes_inuse, 1	; one fewer lane in use
+        mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*4*16]	; digest words of lane idx are 4*16 bytes apart
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*4*16], 3	; xmm0 = digest words 0-3
+	vmovd	xmm1, [state + _args_digest + 4*idx + 4*4*16]
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 7*4*16], 3	; xmm1 = digest words 4-7
+
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0	; aligned stores: _result_digest must be 16-byte aligned
+	vmovdqa	[job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + _XMM_SAVE + 16*0]	; mirror of the prologue saves
+	vmovdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*1]
+	mov     rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	mov     rbp, [rsp + _GPR_SAVE + 8*3]
+	mov     r12, [rsp + _GPR_SAVE + 8*4]
+	mov     r13, [rsp + _GPR_SAVE + 8*5]
+	mov     r14, [rsp + _GPR_SAVE + 8*6]
+	mov     r15, [rsp + _GPR_SAVE + 8*7]
+	add     rsp, STACK_SPACE	; release the frame
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; rax = NULL: no job to return
+	jmp	return	; share the common epilogue
+
+section .data align=16
+
+align 16
+clear_low_nibble:	; 32-byte AND mask: in each 128-bit half, keeps dword 0 with its low 4 bits cleared and zeroes the rest
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1:     dq  1	; qword lane numbers read via cmovne idx, [APPEND(lane_,I)] (cmov has no immediate form)
+lane_2:     dq  2
+lane_3:     dq  3
+lane_4:     dq  4
+lane_5:     dq  5
+lane_6:     dq  6
+lane_7:     dq  7
+lane_8:     dq  8
+lane_9:     dq  9
+lane_10:    dq  10
+lane_11:    dq  11
+lane_12:    dq  12
+lane_13:    dq  13
+lane_14:    dq  14
+lane_15:    dq  15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_flush_avx512	; HAVE_AS_KNOWS_AVX512 undefined: win64 builds export only this placeholder label
+no_sha256_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..7bc9d32a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha256_mb_x16_avx512
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+%define tmp4    rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+%define tmp4    rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_ni_x1
+%define idx             rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+
+; STACK_SPACE must be an odd multiple of 8: rsp is 8 mod 16 at entry, so the sub leaves it 16-byte aligned for the vmovdqa saves
+_XMM_SAVE_SIZE  equ 10*16	; room for xmm6-xmm15 (only stored on win64)
+_GPR_SAVE_SIZE  equ 8*8	; eight qword slots: rbx, rsi, rdi, rbp, r12-r15
+_ALIGN_SIZE     equ 8	; padding: 160 + 64 + 8 = 232 = 29 * 8
+
+_XMM_SAVE       equ 0	; XMM save area sits at the bottom of the frame
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE	; GPR slots follow the XMM area
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE	; total frame size subtracted from rsp
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : state (rdi on elf64, rcx otherwise); returns the flushed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_avx512_ni, function
+sha256_mb_mgr_flush_avx512_ni:
+	endbranch
+	sub     rsp, STACK_SPACE	; reserve the XMM/GPR save area
+	mov     [rsp + _GPR_SAVE + 8*0], rbx	; rbx backs unused_lanes / lane_data / tmp2
+	mov     [rsp + _GPR_SAVE + 8*3], rbp	; rbp backs idx
+	mov     [rsp + _GPR_SAVE + 8*4], r12	; r12-r15 are never written here; NOTE(review): presumably saved on behalf of the callees - confirm
+	mov     [rsp + _GPR_SAVE + 8*5], r13
+	mov     [rsp + _GPR_SAVE + 8*6], r14
+	mov     [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*1], rsi	; win64 only: rsi, rdi and xmm6-xmm15 are callee-saved there
+	mov     [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa  [rsp + _XMM_SAVE + 16*0], xmm6	; aligned stores: rely on rsp being 16-byte aligned after the sub
+	vmovdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]	; 32-bit load zero-extends into r9
+	cmp     num_lanes_inuse, 0
+	jz      return_null	; every lane is empty: return NULL
+
+	; find a lane with a non-null job
+	xor     idx, idx	; default: lane 0
+%assign I 1
+%rep 15
+	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [APPEND(lane_,I)]	; each later hit overwrites idx, so the highest occupied lane wins
+%assign I (I+1)
+%endrep
+
+
+	; give every empty lane the data pointer of lane idx and the maximum length value
+copy_lane_data:
+	mov     tmp, [state + _args + _data_ptr + 8*idx]	; tmp = data pointer of the occupied lane
+
+%assign I 0
+%rep 16
+	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0	; lane I holds a job?
+	jne     APPEND(skip_,I)
+	mov     [state + _args + _data_ptr + 8*I], tmp	; empty lane: borrow the occupied lane's pointer
+	mov     dword [state + _lens + 4*I], 0xFFFFFFFF	; largest unsigned dword, so it loses the unsigned min below
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqu ymm0, [state + _lens + 0*32]	; lens of lanes 0-7
+	vmovdqu ymm1, [state + _lens + 1*32]	; lens of lanes 8-15
+
+	vpminud ymm2, ymm0, ymm1        ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8    ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4    ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1  ; ymm3 has {x,x, x,C3,x,x, x,G3} (128-bit halves swapped)
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has min value in low dword
+
+	vmovd   DWORD(idx), xmm2	; idx = smallest lens word
+	mov     len2, idx
+	and     idx, 0xF	; low 4 bits of the word: lane index
+	shr     len2, 4	; remaining bits: length passed to the hash routine
+	jz      len_is_0	; length 0: lane idx needs no further hashing
+
+	; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func
+	cmp     dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512
+	ja      mb_processing	; unsigned compare: above threshold -> multi-buffer path
+
+	; single-buffer path: lens[idx] is reduced to just the lane index (length bits become 0)
+	mov     [state + _lens + idx*4], DWORD(idx)
+	mov     r10, idx
+	or      r10, 0x4000     ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_ni_x1
+	; callee is expected to leave state and idx intact (len_is_0 relies on both)
+	jmp     len_is_0
+
+mb_processing:
+
+	vpand   ymm2, ymm2, [rel clear_low_nibble]	; drop the lane-index nibble; keeps dword 0 of each 128-bit half
+	vpshufd ymm2, ymm2, 0	; broadcast dword 0 within each half (both halves hold the minimum after the last vpminud)
+
+	vpsubd  ymm0, ymm0, ymm2	; subtract the minimum length from every lane's lens word
+	vpsubd  ymm1, ymm1, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0	; write the reduced lens back
+	vmovdqu [state + _lens + 1*32], ymm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call    sha256_mb_x16_avx512
+	; callee is expected to leave state and idx intact (len_is_0 relies on both)
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size	; byte offset of lane idx inside _ldata
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[idx]
+
+	mov     job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov     qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov     dword [job_rax + _status], STS_COMPLETED
+	mov     unused_lanes, [state + _unused_lanes]	; rbx is reused here; lane_data is dead from this point
+	shl     unused_lanes, 4	; make room for one 4-bit entry
+	or      unused_lanes, idx	; record idx in the new low nibble
+	mov     [state + _unused_lanes], unused_lanes
+
+	mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]	; reloaded: r9 is not relied on across the calls above
+	sub     num_lanes_inuse, 1	; one fewer lane in use
+	mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	vmovd   xmm0, [state + _args_digest + 4*idx + 0*4*16]	; digest words of lane idx are 4*16 bytes apart
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3	; xmm0 = digest words 0-3
+	vmovd   xmm1, [state + _args_digest + 4*idx + 4*4*16]
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3	; xmm1 = digest words 4-7
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0	; aligned stores: _result_digest must be 16-byte aligned
+	vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + _XMM_SAVE + 16*0]	; mirror of the prologue saves
+	vmovdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*1]
+	mov     rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	mov     rbp, [rsp + _GPR_SAVE + 8*3]
+	mov     r12, [rsp + _GPR_SAVE + 8*4]
+	mov     r13, [rsp + _GPR_SAVE + 8*5]
+	mov     r14, [rsp + _GPR_SAVE + 8*6]
+	mov     r15, [rsp + _GPR_SAVE + 8*7]
+	add     rsp, STACK_SPACE	; release the frame
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; rax = NULL: no job to return
+	jmp     return	; share the common epilogue
+
+section .data align=16
+
+align 16
+clear_low_nibble:	; 32-byte AND mask: in each 128-bit half, keeps dword 0 with its low 4 bits cleared and zeroes the rest
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1:     dq  1	; qword lane numbers read via cmovne idx, [APPEND(lane_,I)] (cmov has no immediate form)
+lane_2:     dq  2
+lane_3:     dq  3
+lane_4:     dq  4
+lane_5:     dq  5
+lane_6:     dq  6
+lane_7:     dq  7
+lane_8:     dq  8
+lane_9:     dq  9
+lane_10:    dq  10
+lane_11:    dq  11
+lane_12:    dq  12
+lane_13:    dq  13
+lane_14:    dq  14
+lane_15:    dq  15
+
+ %else
+  %ifidn __OUTPUT_FORMAT__, win64
+   global no_sha256_mb_mgr_flush_avx512_ni	; HAVE_AS_KNOWS_SHANI undefined: win64 builds export only this placeholder label
+   no_sha256_mb_mgr_flush_avx512_ni:
+  %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni	; HAVE_AS_KNOWS_AVX512 undefined: same placeholder label on win64
+  no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..69ae4bad5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern  sha256_mb_x4_sse
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comments name the win64 counterpart
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx     rdx ; rsi
+%else
+; Windows (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx     rsi
+%endif
+
+; Common definitions. Several names alias one register; aliases of the same register must never be live at the same time.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4            r8
+%define lens0           r8
+
+%define lens1           r9
+%define lens2           r10
+%define lens3           r11
+
+; Stack frame, as offsets from rsp after "sub rsp, STACK_SPACE": XMM save area at 0, GPR save area right behind it.
+; STACK_SPACE needs to be an odd multiple of 8 (here 184): rsp is 8 mod 16 on entry and the movdqa saves need it 16-byte aligned
+_XMM_SAVE_SIZE  equ 10*16	; room for xmm6-xmm15 (only stored on win64)
+_GPR_SAVE_SIZE  equ 8*3	; rbx, r12 and (win64 only) rsi
+_ALIGN_SIZE     equ 0
+
+_XMM_SAVE       equ 0
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+; APPEND(a,b) pastes two tokens into one identifier; used to generate the skip_0..skip_3 labels
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state): force one queued job to completion even though not all 4 lanes are filled.
+; arg 1 : state (rdi on elf64, rcx on win64).  Returns rax = the completed job, or NULL when num_lanes_inuse is 0.
+mk_global sha256_mb_mgr_flush_sse, function
+sha256_mb_mgr_flush_sse:
+	endbranch
+	; Prologue: reserve the frame and save callee-saved registers (rsi and xmm6-xmm15 are callee-saved on win64 only)
+	sub     rsp, STACK_SPACE
+	mov     [rsp + _GPR_SAVE + 8*0], rbx
+	mov     [rsp + _GPR_SAVE + 8*1], r12	; r12 is not touched below; presumably clobbered by the called kernels -- TODO confirm
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*2], rsi	; rsi doubles as idx on win64
+	movdqa  [rsp + _XMM_SAVE + 16*0], xmm6	; aligned stores: rely on rsp being 16-byte aligned here
+	movdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; num_lanes_inuse == 0 means every lane is empty: nothing to flush, return NULL
+	cmp	dword [state + _num_lanes_inuse], 0
+	jz	return_null
+
+	; find a lane with a non-null job: idx ends as the highest occupied lane among 1..3, or stays 0 if none of them holds a job
+	xor     idx, idx
+	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [one]	; cmov takes no immediate, hence the constant in memory
+	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [two]
+	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [three]
+
+	; give every empty lane the data pointer of lane idx and the largest possible length word, so it can never win the min search below
+copy_lane_data:
+	mov     tmp, [state + _args + _data_ptr + 8*idx]	; tmp = data pointer of the occupied lane
+	; the %rep below expands once per lane, I = 0..3
+%assign I 0
+%rep 4
+	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne     APPEND(skip_,I)	; lane I holds a job: leave it alone
+	mov     [state + _args + _data_ptr + 8*I], tmp
+	mov     dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length. Each 32-bit lens word is (block count << 4) | lane, so the unsigned minimum gives the shortest job and its lane at once
+	mov     DWORD(lens0), [state + _lens + 0*4]	; 32-bit load zero-extends, so the 64-bit compares below are safe
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx
+	and     idx, 0xF	; idx = lane that owns the shortest job
+	and     len2, ~0xF	; len2 = block count << 4; ZF is set when that count is zero
+	jz      len_is_0	; nothing left to hash for this lane: just retire the job
+
+	; compare with the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function instead of the 4-lane kernel
+	cmp	dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE
+	ja	mb_processing
+
+	; single-buffer path: only lane idx is processed, so only lens[idx] is updated (lens[idx] - len2 == idx)
+	shr     len2, 4	; len2 = number of blocks
+	mov     [state + _lens + idx*4], DWORD(idx)
+	mov	r10, idx	; r10 (alias lens2) is free to reuse: the lens values are dead on this path
+	or	r10, 0x1000	; r10 = idx | (16 << 8): low byte is the lane index, second byte is 16 (4 SSE lanes * 4)
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_opt_x1
+	; state and idx are intact
+	jmp	len_is_0
+
+mb_processing:
+	; multi-buffer path: all four lanes advance by the same number of blocks, so subtract it from every length word
+	sub     lens0, len2
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4	; len2 = number of blocks to hash
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call     sha256_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = address of ldata[idx]
+
+	mov     job_rax, [lane_data + _job_in_lane]	; rax = finished job, also the return value
+	mov     qword [lane_data + _job_in_lane], 0	; the lane is empty again
+	mov     dword [job_rax + _status], STS_COMPLETED
+	mov     unused_lanes, [state + _unused_lanes]	; rbx changes role here: lane_data is no longer needed
+	shl     unused_lanes, 4
+	or      unused_lanes, idx	; push idx as the new low nibble of the free-lane list
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub     dword [state + _num_lanes_inuse], 1
+	; gather the 8 digest words of lane idx: word w is read from _args_digest + 16*w + 4*idx (words are interleaved across the 4 lanes)
+	movd    xmm0, [state + _args_digest + 4*idx + 0*16]
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa  [job_rax + _result_digest + 0*16], xmm0	; aligned store: the job's result_digest must be 16-byte aligned
+	movdqa  [job_rax + _result_digest + 1*16], xmm1
+
+return:
+	; Epilogue: restore exactly what the prologue saved; rax already holds the return value
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa  xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	mov     r12, [rsp + _GPR_SAVE + 8*1]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; rax = NULL: no job was completed
+	jmp     return
+
+section .data align=16
+; Lane-index constants; they live in memory because cmovne only accepts a register or memory source. The code shown only reads them.
+align 16
+one:    dq  1
+two:    dq  2
+three:  dq  3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..43b8fcbe4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern  sha256_mb_x4_sse
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comments name the win64 counterpart
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx     rdx ; rsi
+%else
+; Windows (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx     rsi
+%endif
+
+; Common definitions. Several names alias one register; aliases of the same register must never be live at the same time.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4            r8
+%define lens0           r8
+
+%define lens1           r9
+%define lens2           r10
+%define lens3           r11
+
+; Stack frame, as offsets from rsp after "sub rsp, STACK_SPACE": XMM save area at 0, GPR save area right behind it.
+; STACK_SPACE needs to be an odd multiple of 8 (here 184): rsp is 8 mod 16 on entry and the movdqa saves need it 16-byte aligned
+_XMM_SAVE_SIZE  equ 10*16	; room for xmm6-xmm15 (only stored on win64)
+_GPR_SAVE_SIZE  equ 8*3	; rbx, r12 and (win64 only) rsi
+_ALIGN_SIZE     equ 0
+
+_XMM_SAVE       equ 0
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+; APPEND(a,b) pastes two tokens into one identifier; used to generate the skip_0..skip_3 labels
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse_ni(SHA256_MB_JOB_MGR *state): force one queued job to completion even though not all 4 lanes are filled.
+; arg 1 : state (rdi on elf64, rcx on win64).  Returns rax = the completed job, or NULL when num_lanes_inuse is 0.
+mk_global sha256_mb_mgr_flush_sse_ni, function
+sha256_mb_mgr_flush_sse_ni:
+	endbranch
+	; Prologue: reserve the frame and save callee-saved registers (rsi and xmm6-xmm15 are callee-saved on win64 only)
+	sub     rsp, STACK_SPACE
+	mov     [rsp + _GPR_SAVE + 8*0], rbx
+	mov     [rsp + _GPR_SAVE + 8*1], r12	; r12 is not touched below; presumably clobbered by the called kernels -- TODO confirm
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*2], rsi	; rsi doubles as idx on win64
+	movdqa  [rsp + _XMM_SAVE + 16*0], xmm6	; aligned stores: rely on rsp being 16-byte aligned here
+	movdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; num_lanes_inuse == 0 means every lane is empty: nothing to flush, return NULL
+	cmp     dword [state + _num_lanes_inuse], 0
+	jz      return_null
+
+	; find a lane with a non-null job: idx ends as the highest occupied lane among 1..3, or stays 0 if none of them holds a job
+	xor     idx, idx
+	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [one]	; cmov takes no immediate, hence the constant in memory
+	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [two]
+	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [three]
+
+	; give every empty lane the data pointer of lane idx and the largest possible length word, so it can never win the min search below
+copy_lane_data:
+	mov     tmp, [state + _args + _data_ptr + 8*idx]	; tmp = data pointer of the occupied lane
+	; the %rep below expands once per lane, I = 0..3
+%assign I 0
+%rep 4
+	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne     APPEND(skip_,I)	; lane I holds a job: leave it alone
+	mov     [state + _args + _data_ptr + 8*I], tmp
+	mov     dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length. Each 32-bit lens word is (block count << 4) | lane, so the unsigned minimum gives the shortest job and its lane at once
+	mov     DWORD(lens0), [state + _lens + 0*4]	; 32-bit load zero-extends, so the 64-bit compares below are safe
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx
+	and     idx, 0xF	; idx = lane that owns the shortest job
+	and     len2, ~0xF	; len2 = block count << 4; ZF is set when that count is zero
+	jz      len_is_0	; nothing left to hash for this lane: just retire the job
+
+	; compare with the SHA-NI single-buffer threshold: if num_lanes_inuse <= threshold, use the SHA-NI function instead of the 4-lane kernel
+	cmp     dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_SSE
+	ja      mb_processing
+
+	; single-buffer path: only lane idx is processed, so only lens[idx] is updated (lens[idx] - len2 == idx)
+	shr     len2, 4	; len2 = number of blocks
+	mov     [state + _lens + idx*4], DWORD(idx)
+	mov     r10, idx	; r10 (alias lens2) is free to reuse: the lens values are dead on this path
+	or      r10, 0x1000     ; r10 = idx | (16 << 8): low byte is the lane index, second byte is 16 (4 SSE lanes * 4)
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_ni_x1
+	; state and idx are intact
+	jmp     len_is_0
+
+mb_processing:
+	; multi-buffer path: all four lanes advance by the same number of blocks, so subtract it from every length word
+	sub     lens0, len2
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4	; len2 = number of blocks to hash
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call     sha256_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = address of ldata[idx]
+
+	mov     job_rax, [lane_data + _job_in_lane]	; rax = finished job, also the return value
+	mov     qword [lane_data + _job_in_lane], 0	; the lane is empty again
+	mov     dword [job_rax + _status], STS_COMPLETED
+	mov     unused_lanes, [state + _unused_lanes]	; rbx changes role here: lane_data is no longer needed
+	shl     unused_lanes, 4
+	or      unused_lanes, idx	; push idx as the new low nibble of the free-lane list
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub     dword [state + _num_lanes_inuse], 1
+	; gather the 8 digest words of lane idx: word w is read from _args_digest + 16*w + 4*idx (words are interleaved across the 4 lanes)
+	movd    xmm0, [state + _args_digest + 4*idx + 0*16]
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa  [job_rax + _result_digest + 0*16], xmm0	; aligned store: the job's result_digest must be 16-byte aligned
+	movdqa  [job_rax + _result_digest + 1*16], xmm1
+
+return:
+	; Epilogue: restore exactly what the prologue saved; rax already holds the return value
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa  xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	mov     r12, [rsp + _GPR_SAVE + 8*1]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; rax = NULL: no job was completed
+	jmp     return
+
+section .data align=16
+; Lane-index constants; they live in memory because cmovne only accepts a register or memory source. The code shown only reads them.
+align 16
+one:    dq  1
+two:    dq  2
+three:  dq  3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+  global no_sha256_mb_mgr_flush_sse_ni	; HAVE_AS_KNOWS_SHANI undefined: no flush routine is assembled, win64 builds get only this symbol
+  no_sha256_mb_mgr_flush_sse_ni:	; placeholder label with no code behind it; presumably keeps the win64 object non-empty -- TODO confirm
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..903fb733b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+/* Reset an AVX2 job manager: no lane in use, every lane's length word and job slot cleared. */
+void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state)
+{
+	unsigned int lane = SHA256_X8_LANES;
+	state->num_lanes_inuse = 0;
+	state->unused_lanes = 0xF76543210;	/* one hex digit per free lane id (0..7) below a 0xF digit */
+	while (lane-- > 0) {	/* visits lanes SHA256_X8_LANES-1 down to 0 */
+		state->ldata[lane].job_in_lane = 0;
+		state->lens[lane] = 0;
+	}
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..b875735f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+/* Reset an AVX512 job manager: no lane in use, every lane's length word and job slot cleared. */
+void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state)
+{
+	unsigned int lane = SHA256_MAX_LANES;
+	state->num_lanes_inuse = 0;
+	state->unused_lanes = 0xfedcba9876543210;	/* one hex digit per free lane id, 0x0..0xf */
+	while (lane-- > 0) {	/* visits lanes SHA256_MAX_LANES-1 down to 0 */
+		state->ldata[lane].job_in_lane = 0;
+		state->lens[lane] = 0;
+	}
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
new file mode 100644
index 000000000..cf22c4aee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+/* Reset an SSE job manager: no lane in use, every lane's length word and job slot cleared. */
+void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state)
+{
+	unsigned int lane = SHA256_MIN_LANES;
+	state->num_lanes_inuse = 0;
+	state->unused_lanes = 0xF3210;	/* one hex digit per free lane id (0..3) below a 0xF digit */
+	while (lane-- > 0) {	/* visits lanes SHA256_MIN_LANES-1 down to 0 */
+		state->ldata[lane].job_in_lane = 0;
+		state->lens[lane] = 0;
+	}
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..cb7d5790a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
@@ -0,0 +1,260 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comments name the win64 counterpart
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx             rdx ; rsi
+%define last_len        rdx ; rsi
+
+%define size_offset     rcx ; rdi
+%define tmp2            rcx ; rdi
+
+%else
+; Windows (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len        rsi
+%define idx             rsi
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%endif
+
+; Common definitions. Several names alias one register; aliases of the same register must never be live at the same time.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+%define p2      arg2
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+
+%define job_rax         rax
+%define len             rax
+
+%define lane            rbp
+%define tmp3            rbp
+%define lens3           rbp
+
+%define extra_blocks    r8
+%define lens0           r8
+
+%define tmp             r9
+%define lens1           r9
+
+%define lane_data       r10
+%define lens2           r10
+
+; Here _XMM_SAVE and _GPR_SAVE are the SIZES of the two save areas. XMM registers are stored at rsp+0, so _XMM_SAVE is also the offset of the GPR slots.
+; STACK_SPACE needs to be an odd multiple of 8 (here 200): rsp is 8 mod 16 on entry and the vmovdqa saves need it 16-byte aligned
+%define _XMM_SAVE       16*10
+%define _GPR_SAVE       8*5
+%define STACK_SPACE     _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job): put job into a free lane; hashing only runs once no free lane is left.
+; arg 1 : state (rdi on elf64, rcx on win64)
+; arg 2 : job   (rsi on elf64, rdx on win64).  Returns rax = a completed job, or NULL while free lanes remain.
+mk_global sha256_mb_mgr_submit_avx, function
+sha256_mb_mgr_submit_avx:
+	endbranch
+	; Prologue: GPR slots start at offset _XMM_SAVE, i.e. right behind the 10 XMM slots at rsp+0
+	sub     rsp, STACK_SPACE
+	mov     [rsp + _XMM_SAVE + 8*0], rbx
+	mov     [rsp + _XMM_SAVE + 8*1], rbp
+	mov     [rsp + _XMM_SAVE + 8*2], r12	; r12 is not touched below; presumably clobbered by the called kernel -- TODO confirm
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _XMM_SAVE + 8*3], rsi
+	mov     [rsp + _XMM_SAVE + 8*4], rdi
+	vmovdqa  [rsp + 16*0], xmm6	; aligned stores: rely on rsp being 16-byte aligned here
+	vmovdqa  [rsp + 16*1], xmm7
+	vmovdqa  [rsp + 16*2], xmm8
+	vmovdqa  [rsp + 16*3], xmm9
+	vmovdqa  [rsp + 16*4], xmm10
+	vmovdqa  [rsp + 16*5], xmm11
+	vmovdqa  [rsp + 16*6], xmm12
+	vmovdqa  [rsp + 16*7], xmm13
+	vmovdqa  [rsp + 16*8], xmm14
+	vmovdqa  [rsp + 16*9], xmm15
+%endif
+	; pop a free lane: the low nibble of unused_lanes is the lane to use, the remaining nibbles shift down
+	mov     unused_lanes, [state + _unused_lanes]
+	movzx   lane, BYTE(unused_lanes)
+	and     lane, 0xF	; lane = low nibble
+	shr     unused_lanes, 4
+	imul    lane_data, lane, _LANE_DATA_size
+	mov     dword [job + _status], STS_BEING_PROCESSED
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = address of ldata[lane]
+	mov     [state + _unused_lanes], unused_lanes
+	mov     DWORD(len), [job + _len]	; 32-bit load, zero-extended
+	; pack the length word as (job len << 4) | lane so the min search below recovers both from one value
+	shl	len, 4
+	or	len, lane
+
+	mov     [lane_data + _job_in_lane], job
+	mov     [state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest and scatter them: word w of this lane is written to _args_digest + 16*w + 4*lane
+	vmovdqa	xmm0, [job + _result_digest + 0*16]	; aligned load: the job's result_digest must be 16-byte aligned
+	vmovdqa	xmm1, [job + _result_digest + 1*16]
+	vmovd    [state + _args_digest + 4*lane + 0*16], xmm0
+	vpextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
+	vpextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
+	vpextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
+	vmovd    [state + _args_digest + 4*lane + 4*16], xmm1
+	vpextrd  [state + _args_digest + 4*lane + 5*16], xmm1, 1
+	vpextrd  [state + _args_digest + 4*lane + 6*16], xmm1, 2
+	vpextrd  [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+	; record the job's input buffer as this lane's data pointer
+	mov     p, [job + _buffer]
+	mov     [state + _args_data_ptr + 8*lane], p
+
+	add	dword [state + _num_lanes_inuse], 1
+	cmp     unused_lanes, 0xF	; only the 0xF marker left means no free lane remains
+	jne     return_null	; free lanes remain: do not hash yet, report "nothing completed"
+
+start_loop:
+	; Find min length. Each 32-bit lens word is (len << 4) | lane, so the unsigned minimum gives the shortest job and its lane at once
+	mov     DWORD(lens0), [state + _lens + 0*4]	; 32-bit load zero-extends, so the 64-bit compares below are safe
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]	; lens3 is rbp: the "lane" alias is dead by now
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx	; len2 is arg2: the job pointer is no longer needed
+	and     idx, 0xF	; idx = lane that owns the shortest job
+	and     len2, ~0xF	; len2 = len << 4; ZF is set when that length is zero
+	jz      len_is_0	; nothing to hash for this lane: just retire the job
+	; all four lanes advance by the same amount, so subtract it from every length word
+	sub     lens0, len2
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4	; len2 = length handed to the kernel
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call    sha256_mb_x4_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = address of ldata[idx]
+
+	mov     job_rax, [lane_data + _job_in_lane]	; rax = finished job, also the return value
+	mov     unused_lanes, [state + _unused_lanes]
+	mov     qword [lane_data + _job_in_lane], 0	; the lane is empty again
+	mov     dword [job_rax + _status], STS_COMPLETED
+	shl     unused_lanes, 4
+	or      unused_lanes, idx	; push idx as the new low nibble of the free-lane list
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub	dword [state + _num_lanes_inuse], 1
+	; gather the 8 digest words of lane idx back into two vectors (inverse of the scatter done at submit time)
+	vmovd    xmm0, [state + _args_digest + 4*idx + 0*16]
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	vmovd    xmm1, [state + _args_digest + 4*idx + 4*16]
+	vpinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	vpinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	vpinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	vmovdqa  [job_rax + _result_digest + 0*16], xmm0	; aligned store into the finished job's result_digest
+	vmovdqa  [job_rax + _result_digest + 1*16], xmm1
+
+return:
+	; Epilogue: restore exactly what the prologue saved; rax already holds the return value
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6,  [rsp + 16*0]
+	vmovdqa  xmm7,  [rsp + 16*1]
+	vmovdqa  xmm8,  [rsp + 16*2]
+	vmovdqa  xmm9,  [rsp + 16*3]
+	vmovdqa  xmm10, [rsp + 16*4]
+	vmovdqa  xmm11, [rsp + 16*5]
+	vmovdqa  xmm12, [rsp + 16*6]
+	vmovdqa  xmm13, [rsp + 16*7]
+	vmovdqa  xmm14, [rsp + 16*8]
+	vmovdqa  xmm15, [rsp + 16*9]
+	mov     rsi, [rsp + _XMM_SAVE + 8*3]
+	mov     rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+	mov     rbx, [rsp + _XMM_SAVE + 8*0]
+	mov     rbp, [rsp + _XMM_SAVE + 8*1]
+	mov     r12, [rsp + _XMM_SAVE + 8*2]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; rax = NULL: no job was completed
+	jmp     return
+
+section .data align=16
+; SHA-256 initial hash values H0..H7. None of these labels is referenced by sha256_mb_mgr_submit_avx above.
+align 16
+H0:     dd  0x6a09e667
+H1:     dd  0xbb67ae85
+H2:     dd  0x3c6ef372
+H3:     dd  0xa54ff53a
+H4:     dd  0x510e527f
+H5:     dd  0x9b05688c
+H6:     dd  0x1f83d9ab
+H7:     dd  0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..af2fc89ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comment on each line is the win64 register
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+%define size_offset     rcx ; rdi
+%define tmp2            rcx ; rdi
+
+%else
+; WINDOWS (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%endif
+
+; Common definitions -- names that share a register (job/len2, job_rax/len, ...) must not be live together
+%define state   arg1	; SHA256_MB_JOB_MGR* (first argument)
+%define job     arg2	; SHA256_JOB* (second argument)
+%define len2    arg2	; minimum length handed to the x8 kernel; overwrites job
+%define p2      arg2	; not referenced in this file
+
+%define idx             r8	; lane index of the job being completed
+%define last_len        r8	; not referenced in this file
+%define p               r11	; job's input buffer pointer
+%define start_offset    r11	; not referenced in this file
+
+%define unused_lanes    rbx	; nibble-packed list of free lane indices
+
+%define job_rax         rax	; return value: completed job, or 0
+%define len             rax	; submitted job's length, later packed as (len << 4) | lane
+
+%define lane            rbp	; lane index allocated to the submitted job
+%define tmp3            rbp	; not referenced in this file
+
+%define tmp             r9	; not referenced in this file
+
+%define lane_data       r10	; state + _ldata + index * _LANE_DATA_size
+
+
+; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 mod 16 on entry, so the sub leaves it 16-byte aligned
+%define STACK_SPACE	8*8 + 16*10 + 8	; 8 GPR slots + 10 XMM slots + 8 bytes of padding
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : state (rdi on elf64, rcx on win64)
+; arg 2 : job   (rsi on elf64, rdx on win64); returns rax = a completed job, or 0 if none
+mk_global sha256_mb_mgr_submit_avx2, function
+sha256_mb_mgr_submit_avx2:
+	endbranch
+
+	sub     rsp, STACK_SPACE
+	mov     [rsp + 8*0], rbx
+	mov     [rsp + 8*3], rbp
+	mov     [rsp + 8*4], r12	; r12-r15 are not written here; presumably clobbered by sha256_mb_x8_avx2
+	mov     [rsp + 8*5], r13
+	mov     [rsp + 8*6], r14
+	mov     [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + 8*1], rsi	; rsi, rdi and xmm6-xmm15 are callee-saved on win64
+	mov     [rsp + 8*2], rdi
+	vmovdqa  [rsp + 8*8 + 16*0], xmm6
+	vmovdqa  [rsp + 8*8 + 16*1], xmm7
+	vmovdqa  [rsp + 8*8 + 16*2], xmm8
+	vmovdqa  [rsp + 8*8 + 16*3], xmm9
+	vmovdqa  [rsp + 8*8 + 16*4], xmm10
+	vmovdqa  [rsp + 8*8 + 16*5], xmm11
+	vmovdqa  [rsp + 8*8 + 16*6], xmm12
+	vmovdqa  [rsp + 8*8 + 16*7], xmm13
+	vmovdqa  [rsp + 8*8 + 16*8], xmm14
+	vmovdqa  [rsp + 8*8 + 16*9], xmm15
+%endif
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF			; lane = lowest nibble = next free lane index
+	shr	unused_lanes, 4			; pop that nibble from the free list
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = this lane's _ldata entry
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	shl	len, 4
+	or	len, lane			; pack (len << 4) | lane so the minimum search also yields the lane
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	mov	[lane_data + _job_in_lane], job
+
+	; Load digest words from result_digest and scatter them: word i -> row i (stride 4*8 bytes), column lane
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovdqu xmm1, [job + _result_digest + 1*16]
+	vmovd	[state + _args_digest + 4*lane + 0*4*8], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+	vmovd	[state + _args_digest + 4*lane + 4*4*8], xmm1
+	vpextrd	[state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+	vpextrd	[state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+	vpextrd	[state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	add	dword [state + _num_lanes_inuse], 1
+	cmp	unused_lanes, 0xf		; only 0xF left after the pop (presumably the end marker: no free lane)?
+	jne	return_null			; lanes still free: return 0 without hashing
+
+start_loop:
+	; Find min length
+	vmovdqa xmm0, [state + _lens + 0*16]
+	vmovdqa xmm1, [state + _lens + 1*16]
+
+	vpminud xmm2, xmm0, xmm1        ; xmm2 has {D,C,B,A}
+	vpalignr xmm3, xmm3, xmm2, 8    ; xmm3 has {x,x,D,C}
+	vpminud xmm2, xmm2, xmm3        ; xmm2 has {x,x,E,F}
+	vpalignr xmm3, xmm3, xmm2, 4    ; xmm3 has {x,x,x,E}
+	vpminud xmm2, xmm2, xmm3        ; xmm2 has min value in low dword
+
+	vmovd   DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF			; idx = lane that holds the minimum
+	shr	len2, 4				; len2 = minimum length; ZF is set when it is 0
+	jz	len_is_0			; nothing to hash: lane idx is already finished
+
+	vpand   xmm2, xmm2, [rel clear_low_nibble]	; keep only the length bits of the minimum
+	vpshufd xmm2, xmm2, 0				; broadcast it to all four dwords
+
+	vpsubd  xmm0, xmm0, xmm2			; subtract the minimum length from every lane
+	vpsubd  xmm1, xmm1, xmm2
+
+	vmovdqa [state + _lens + 0*16], xmm0
+	vmovdqa [state + _lens + 1*16], xmm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha256_mb_x8_avx2
+
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	shl	unused_lanes, 4
+	or	unused_lanes, idx		; push lane idx back onto the free list
+	mov	[state + _unused_lanes], unused_lanes
+
+	sub	dword [state + _num_lanes_inuse], 1
+
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*4*8]	; gather column idx of the digest matrix
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+	vmovd	xmm1, [state + _args_digest + 4*idx + 4*4*8]
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+	vmovdqu	[job_rax + _result_digest + 0*16], xmm0	; unaligned-safe store, matching the vmovdqu load above
+	vmovdqu	[job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa  xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa  xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa  xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa  xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa  xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa  xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa  xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa  xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa  xmm15, [rsp + 8*8 + 16*9]
+	mov     rsi, [rsp + 8*1]
+	mov     rdi, [rsp + 8*2]
+%endif
+	mov     rbx, [rsp + 8*0]
+	mov     rbp, [rsp + 8*3]
+	mov     r12, [rsp + 8*4]
+	mov     r13, [rsp + 8*5]
+	mov     r14, [rsp + 8*6]
+	mov     r15, [rsp + 8*7]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; no job completed: return 0
+	jmp     return
+
+section .data align=16
+; vpand mask: keeps bits 4..31 of the low dword (the length) and zeroes the lane nibble and the upper dwords
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..cdc477370
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comment on each line is the win64 register
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+%define size_offset     rcx ; rdi
+%define tmp2            rcx ; rdi
+
+%else
+; WINDOWS (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%endif
+
+; Common definitions -- names that share a register (job/len2, p/num_lanes_inuse, ...) must not be live together
+%define state   arg1	; SHA256_MB_JOB_MGR* (first argument)
+%define job     arg2	; SHA256_JOB* (second argument)
+%define len2    arg2	; minimum length handed to the x16 kernel; overwrites job
+%define p2      arg2	; not referenced in this file
+
+%define idx             r8	; lane index of the job being completed
+%define last_len        r8	; not referenced in this file
+%define p               r11	; job's input buffer pointer
+%define start_offset    r11	; not referenced in this file
+%define num_lanes_inuse r11	; working copy of state's _num_lanes_inuse counter
+
+%define unused_lanes    rbx	; nibble-packed list of free lane indices
+
+%define job_rax         rax	; return value: completed job, or 0
+%define len             rax	; submitted job's length, later packed as (len << 4) | lane
+
+%define lane            rbp	; lane index allocated to the submitted job
+%define tmp3            rbp	; not referenced in this file
+
+%define tmp             r9	; not referenced in this file
+
+%define lane_data       r10	; state + _ldata + index * _LANE_DATA_size
+
+
+; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 mod 16 on entry, so the sub leaves it 16-byte aligned
+%define STACK_SPACE	8*8 + 16*10 + 8	; 8 GPR slots + 10 XMM slots + 8 bytes of padding
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : state (rdi on elf64, rcx on win64)
+; arg 2 : job   (rsi on elf64, rdx on win64); returns rax = a completed job, or 0 if none
+mk_global sha256_mb_mgr_submit_avx512, function
+sha256_mb_mgr_submit_avx512:
+	endbranch
+
+	sub     rsp, STACK_SPACE
+	mov     [rsp + 8*0], rbx
+	mov     [rsp + 8*3], rbp
+	mov     [rsp + 8*4], r12	; r12-r15 are not written here; presumably clobbered by sha256_mb_x16_avx512
+	mov     [rsp + 8*5], r13
+	mov     [rsp + 8*6], r14
+	mov     [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + 8*1], rsi	; rsi, rdi and xmm6-xmm15 are callee-saved on win64
+	mov     [rsp + 8*2], rdi
+	vmovdqa  [rsp + 8*8 + 16*0], xmm6
+	vmovdqa  [rsp + 8*8 + 16*1], xmm7
+	vmovdqa  [rsp + 8*8 + 16*2], xmm8
+	vmovdqa  [rsp + 8*8 + 16*3], xmm9
+	vmovdqa  [rsp + 8*8 + 16*4], xmm10
+	vmovdqa  [rsp + 8*8 + 16*5], xmm11
+	vmovdqa  [rsp + 8*8 + 16*6], xmm12
+	vmovdqa  [rsp + 8*8 + 16*7], xmm13
+	vmovdqa  [rsp + 8*8 + 16*8], xmm14
+	vmovdqa  [rsp + 8*8 + 16*9], xmm15
+%endif
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF			; lane = lowest nibble = next free lane index
+	shr	unused_lanes, 4			; pop that nibble from the free list
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = this lane's _ldata entry
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	shl	len, 4
+	or	len, lane			; pack (len << 4) | lane so the minimum search also yields the lane
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	mov	[lane_data + _job_in_lane], job
+
+	; Load digest words from result_digest and scatter them: word i -> row i (stride 4*16 bytes), column lane
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovdqu xmm1, [job + _result_digest + 1*16]
+	vmovd	[state + _args_digest + 4*lane + 0*4*16], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+	vmovd	[state + _args_digest + 4*lane + 4*4*16], xmm1
+	vpextrd	[state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+	vpextrd	[state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+	vpextrd	[state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+        add     num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+        cmp     num_lanes_inuse, 16	; hash only once all 16 lanes hold a job
+	jne	return_null		; otherwise return 0 without hashing
+
+start_loop:
+	; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1        ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8    ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4    ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,x, x, x,x,x, x,C3}
+        vpminud ymm2, ymm2, ymm3        ; ymm2 has min value in low dword
+
+	vmovd   DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF			; idx = lane that holds the minimum
+	shr	len2, 4				; len2 = minimum length; ZF is set when it is 0
+	jz	len_is_0			; nothing to hash: lane idx is already finished
+
+        vpand   ymm2, ymm2, [rel clear_low_nibble]	; keep only the length bits of the minimum
+        vpshufd ymm2, ymm2, 0				; broadcast it within each 128-bit half
+
+        vpsubd  ymm0, ymm0, ymm2			; subtract the minimum length from every lane
+        vpsubd  ymm1, ymm1, ymm2
+
+        vmovdqu [state + _lens + 0*32], ymm0
+        vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha256_mb_x16_avx512
+
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	shl	unused_lanes, 4
+	or	unused_lanes, idx		; push lane idx back onto the free list
+	mov	[state + _unused_lanes], unused_lanes
+
+        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+        sub     num_lanes_inuse, 1
+        mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*4*16]	; gather column idx of the digest matrix
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+	vmovd	xmm1, [state + _args_digest + 4*idx + 4*4*16]
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+	vpinsrd	xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+	vmovdqu	[job_rax + _result_digest + 0*16], xmm0	; unaligned-safe store, matching the vmovdqu load above
+	vmovdqu	[job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa  xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa  xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa  xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa  xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa  xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa  xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa  xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa  xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa  xmm15, [rsp + 8*8 + 16*9]
+	mov     rsi, [rsp + 8*1]
+	mov     rdi, [rsp + 8*2]
+%endif
+	mov     rbx, [rsp + 8*0]
+	mov     rbp, [rsp + 8*3]
+	mov     r12, [rsp + 8*4]
+	mov     r13, [rsp + 8*5]
+	mov     r14, [rsp + 8*6]
+	mov     r15, [rsp + 8*7]
+	add     rsp, STACK_SPACE
+
+	ret	; NOTE(review): no vzeroupper is issued in this function although ymm0-ymm3 are written above
+
+return_null:
+	xor     job_rax, job_rax	; no job completed: return 0
+	jmp     return
+
+section .data align=32
+; vpand mask, repeated for both 128-bit halves: keeps bits 4..31 of each half's low dword, zeroes everything else
+align 32
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else	; HAVE_AS_KNOWS_AVX512 is not defined: the function above is not assembled
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_submit_avx512	; win64 only: export a placeholder label (presumably so the object is not empty)
+no_sha256_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..b1bbc7002
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern  sha256_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comment on each line is the win64 register
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx             rdx ; rsi
+%define last_len        rdx ; rsi
+
+%define size_offset     rcx ; rdi
+%define tmp2            rcx ; rdi
+
+%else
+; WINDOWS (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len        rsi
+%define idx             rsi
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%endif
+
+; Common definitions -- names that share a register (lane/lens3, lane_data/lens2, ...) must not be live together
+%define state   arg1	; SHA256_MB_JOB_MGR* (first argument)
+%define job     arg2	; SHA256_JOB* (second argument)
+%define len2    arg2	; minimum length handed to the x4 kernel; overwrites job
+%define p2      arg2	; not referenced in this file
+
+%define p               r11	; job's input buffer pointer
+%define start_offset    r11	; not referenced in this file
+
+%define unused_lanes    rbx	; nibble-packed list of free lane indices
+
+%define job_rax         rax	; return value: completed job, or 0
+%define len             rax	; submitted job's length, later packed as (len << 4) | lane
+
+%define lane            rbp	; lane index allocated to the submitted job
+%define tmp3            rbp	; not referenced in this file
+%define lens3           rbp	; packed length of lane 3 during the minimum search
+
+%define extra_blocks    r8	; not referenced in this file
+%define lens0           r8	; packed length of lane 0
+
+%define tmp             r9	; not referenced in this file
+%define lens1           r9	; packed length of lane 1
+
+%define lane_data       r10	; state + _ldata + index * _LANE_DATA_size
+%define lens2           r10	; packed length of lane 2
+
+
+; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 mod 16 on entry, so the sub leaves it 16-byte aligned
+%define _XMM_SAVE       16*10	; xmm6-xmm15 save area at [rsp]; also the offset of the GPR save area
+%define _GPR_SAVE       8*5	; rbx, rbp, r12, plus rsi and rdi on win64
+%define STACK_SPACE     _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : state (rdi on elf64, rcx on win64)
+; arg 2 : job   (rsi on elf64, rdx on win64); returns rax = a completed job, or 0 if none
+mk_global sha256_mb_mgr_submit_sse, function
+sha256_mb_mgr_submit_sse:
+	endbranch
+
+	sub     rsp, STACK_SPACE
+	mov     [rsp + _XMM_SAVE + 8*0], rbx
+	mov     [rsp + _XMM_SAVE + 8*1], rbp
+	mov     [rsp + _XMM_SAVE + 8*2], r12	; r12 is not written here; presumably clobbered by sha256_mb_x4_sse
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _XMM_SAVE + 8*3], rsi	; rsi, rdi and xmm6-xmm15 are callee-saved on win64
+	mov     [rsp + _XMM_SAVE + 8*4], rdi
+	movdqa  [rsp + 16*0], xmm6
+	movdqa  [rsp + 16*1], xmm7
+	movdqa  [rsp + 16*2], xmm8
+	movdqa  [rsp + 16*3], xmm9
+	movdqa  [rsp + 16*4], xmm10
+	movdqa  [rsp + 16*5], xmm11
+	movdqa  [rsp + 16*6], xmm12
+	movdqa  [rsp + 16*7], xmm13
+	movdqa  [rsp + 16*8], xmm14
+	movdqa  [rsp + 16*9], xmm15
+%endif
+
+	mov     unused_lanes, [state + _unused_lanes]
+	movzx   lane, BYTE(unused_lanes)
+	and     lane, 0xF			; lane = lowest nibble = next free lane index
+	shr     unused_lanes, 4			; pop that nibble from the free list
+	imul    lane_data, lane, _LANE_DATA_size
+	mov     dword [job + _status], STS_BEING_PROCESSED
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = this lane's _ldata entry
+	mov     [state + _unused_lanes], unused_lanes
+	mov     DWORD(len), [job + _len]
+
+	shl	len, 4
+	or	len, lane			; pack (len << 4) | lane so the minimum search also yields the lane
+
+	mov     [lane_data + _job_in_lane], job
+	mov     [state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest (movdqa: the field must be 16-byte aligned) and scatter them by lane
+	movdqa	xmm0, [job + _result_digest + 0*16]
+	movdqa	xmm1, [job + _result_digest + 1*16]
+	movd    [state + _args_digest + 4*lane + 0*16], xmm0	; word i -> row i (stride 16 bytes), column lane
+	pextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
+	pextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
+	pextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
+	movd    [state + _args_digest + 4*lane + 4*16], xmm1
+	pextrd  [state + _args_digest + 4*lane + 5*16], xmm1, 1
+	pextrd  [state + _args_digest + 4*lane + 6*16], xmm1, 2
+	pextrd  [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+	mov     p, [job + _buffer]
+	mov     [state + _args_data_ptr + 8*lane], p
+
+	add	dword [state + _num_lanes_inuse], 1
+	cmp     unused_lanes, 0xF		; only 0xF left after the pop (presumably the end marker: no free lane)?
+	jne     return_null			; lanes still free: return 0 without hashing
+
+start_loop:
+	; Find min length (unsigned) over the four packed lens; lane and lane_data are dead from here on
+	mov     DWORD(lens0), [state + _lens + 0*4]
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx
+	and     idx, 0xF			; idx = lane that holds the minimum
+	and     len2, ~0xF			; len2 = minimum length, still shifted left by 4
+	jz      len_is_0			; nothing to hash: lane idx is already finished
+
+	sub     lens0, len2			; subtract the minimum length from every lane
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4				; unshifted length for the kernel
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call     sha256_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]
+
+	mov     job_rax, [lane_data + _job_in_lane]
+	mov     unused_lanes, [state + _unused_lanes]
+	mov     qword [lane_data + _job_in_lane], 0
+	mov     dword [job_rax + _status], STS_COMPLETED
+	shl     unused_lanes, 4
+	or      unused_lanes, idx		; push lane idx back onto the free list
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub	dword [state + _num_lanes_inuse], 1
+
+	movd    xmm0, [state + _args_digest + 4*idx + 0*16]	; gather column idx of the digest matrix
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa  [job_rax + _result_digest + 0*16], xmm0	; aligned store into the completed job's digest
+	movdqa  [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa  xmm6,  [rsp + 16*0]
+	movdqa  xmm7,  [rsp + 16*1]
+	movdqa  xmm8,  [rsp + 16*2]
+	movdqa  xmm9,  [rsp + 16*3]
+	movdqa  xmm10, [rsp + 16*4]
+	movdqa  xmm11, [rsp + 16*5]
+	movdqa  xmm12, [rsp + 16*6]
+	movdqa  xmm13, [rsp + 16*7]
+	movdqa  xmm14, [rsp + 16*8]
+	movdqa  xmm15, [rsp + 16*9]
+	mov     rsi, [rsp + _XMM_SAVE + 8*3]
+	mov     rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+	mov     rbx, [rsp + _XMM_SAVE + 8*0]
+	mov     rbp, [rsp + _XMM_SAVE + 8*1]
+	mov     r12, [rsp + _XMM_SAVE + 8*2]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; no job completed: return 0
+	jmp     return
+
+
+section .data align=16
+; SHA-256 initial hash values H0..H7 (FIPS 180-4); no instruction in this file references them
+align 16
+H0:     dd  0x6a09e667
+H1:     dd  0xbb67ae85
+H2:     dd  0x3c6ef372
+H3:     dd  0xa54ff53a
+H4:     dd  0x510e527f
+H5:     dd  0x9b05688c
+H6:     dd  0x1f83d9ab
+H7:     dd  0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..cb1dce641
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,301 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern  sha256_mb_x4_sse
+extern  sha256_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux (elf64) register definitions; the trailing comment on each line is the win64 register
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx             rdx ; rsi
+%define last_len        rdx ; rsi
+
+%define size_offset     rcx ; rdi
+%define tmp2            rcx ; rdi
+
+%else
+; WINDOWS (win64) register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len        rsi
+%define idx             rsi
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%endif
+
+; Common definitions -- names that share a register (lane/lens3, lane_data/lens2, ...) must not be live together
+%define state   arg1	; SHA256_MB_JOB_MGR* (first argument)
+%define job     arg2	; SHA256_JOB* (second argument)
+%define len2    arg2	; minimum length handed to the hashing kernel; overwrites job
+%define p2      arg2	; not referenced in this file
+
+%define p               r11	; job's input buffer pointer
+%define start_offset    r11	; not referenced in this file
+
+%define unused_lanes    rbx	; nibble-packed list of free lane indices
+
+%define job_rax         rax	; return value: completed job, or 0
+%define len             rax	; submitted job's length, later packed as (len << 4) | lane
+
+%define lane            rbp	; lane index allocated to the submitted job
+%define tmp3            rbp	; not referenced in this file
+%define lens3           rbp	; packed length of lane 3 (4-lane path only)
+
+%define extra_blocks    r8	; not referenced in this file
+%define lens0           r8	; packed length of lane 0
+
+%define tmp             r9	; not referenced in this file
+%define lens1           r9	; packed length of lane 1
+
+%define lane_data       r10	; state + _ldata + index * _LANE_DATA_size
+%define lens2           r10	; packed length of lane 2 (4-lane path only)
+
+; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 mod 16 on entry, so the sub leaves it 16-byte aligned
+%define _XMM_SAVE       16*10	; xmm6-xmm15 save area at [rsp]; also the offset of the GPR save area
+%define _GPR_SAVE       8*7	; rbx, rbp, r12, r13, r14, plus rsi and rdi on win64
+%define STACK_SPACE     _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse_ni(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : state (rdi on elf64, rcx on win64)
+; arg 2 : job   (rsi on elf64, rdx on win64); returns rax = a completed job, or 0 if none
+mk_global sha256_mb_mgr_submit_sse_ni, function
+sha256_mb_mgr_submit_sse_ni:
+	endbranch
+
+	sub     rsp, STACK_SPACE
+	mov     [rsp + _XMM_SAVE + 8*0], rbx
+	mov     [rsp + _XMM_SAVE + 8*1], rbp
+	mov     [rsp + _XMM_SAVE + 8*2], r12	; r12-r14 are not written here; presumably clobbered by the kernels
+	mov     [rsp + _XMM_SAVE + 8*5], r13
+	mov     [rsp + _XMM_SAVE + 8*6], r14
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _XMM_SAVE + 8*3], rsi	; rsi, rdi and xmm6-xmm15 are callee-saved on win64
+	mov     [rsp + _XMM_SAVE + 8*4], rdi
+	movdqa  [rsp + 16*0], xmm6
+	movdqa  [rsp + 16*1], xmm7
+	movdqa  [rsp + 16*2], xmm8
+	movdqa  [rsp + 16*3], xmm9
+	movdqa  [rsp + 16*4], xmm10
+	movdqa  [rsp + 16*5], xmm11
+	movdqa  [rsp + 16*6], xmm12
+	movdqa  [rsp + 16*7], xmm13
+	movdqa  [rsp + 16*8], xmm14
+	movdqa  [rsp + 16*9], xmm15
+%endif
+
+	mov     unused_lanes, [state + _unused_lanes]
+	movzx   lane, BYTE(unused_lanes)
+	and     lane, 0xF			; lane = lowest nibble = next free lane index
+	shr     unused_lanes, 4			; pop that nibble from the free list
+	imul    lane_data, lane, _LANE_DATA_size
+	mov     dword [job + _status], STS_BEING_PROCESSED
+	lea     lane_data, [state + _ldata + lane_data]	; lane_data = this lane's _ldata entry
+	mov     [state + _unused_lanes], unused_lanes
+	mov     DWORD(len), [job + _len]
+
+	shl     len, 4
+	or      len, lane			; pack (len << 4) | lane so the minimum search also yields the lane
+
+	mov     [lane_data + _job_in_lane], job
+	mov     [state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest (movdqa: the field must be 16-byte aligned) and scatter them by lane
+	movdqa  xmm0, [job + _result_digest + 0*16]
+	movdqa  xmm1, [job + _result_digest + 1*16]
+	movd    [state + _args_digest + 4*lane + 0*16], xmm0	; word i -> row i (stride 16 bytes), column lane
+	pextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
+	pextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
+	pextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
+	movd    [state + _args_digest + 4*lane + 4*16], xmm1
+	pextrd  [state + _args_digest + 4*lane + 5*16], xmm1, 1
+	pextrd  [state + _args_digest + 4*lane + 6*16], xmm1, 2
+	pextrd  [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+	mov     p, [job + _buffer]
+	mov     [state + _args_data_ptr + 8*lane], p
+
+	add     dword [state + _num_lanes_inuse], 1
+
+	; The 0xF32 test belongs to the SHA-NI path only; ahead of the %if it made the 4-lane path unreachable.
+	; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func
+  %if SHA256_NI_SB_THRESHOLD_SSE >= 4   ; there are 4 lanes in sse mb
+  ; shani glue code
+	cmp     unused_lanes, 0xF32	; we will process two jobs at the same time
+	jne 	return_null		; wait for another sha_ni job
+	mov     DWORD(lens0), [state + _lens + 0*4]
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1		; unsigned minimum of the two packed lens
+	mov     len2, idx
+	and     idx, 0xF		; idx = lane that holds the minimum
+	and     len2, ~0xF		; len2 = minimum length, still shifted left by 4
+	jz      len_is_0		; nothing to hash: lane idx is already finished
+	; lensN-len2=idx
+	sub     lens0, len2
+	sub     lens1, len2
+
+	shr     len2, 4			; unshifted length for the kernel
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     r10, idx
+	or      r10, 0x1000     ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha256_ni_x2
+	; state and idx are intact
+  %else
+  ; original mb code
+	cmp     unused_lanes, 0xF	; only 0xF left after the pop (presumably the end marker: no free lane)?
+	jne     return_null		; lanes still free: return 0 without hashing
+
+    start_loop:
+	; Find min length
+	mov     DWORD(lens0), [state + _lens + 0*4]
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx
+	and     idx, 0xF		; idx = lane that holds the minimum
+	and     len2, ~0xF		; len2 = minimum length, still shifted left by 4
+	jz      len_is_0		; nothing to hash: lane idx is already finished
+
+	sub     lens0, len2		; subtract the minimum length from every lane
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4			; unshifted length for the kernel
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call     sha256_mb_x4_sse
+	; state and idx are intact
+  %endif
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]
+
+	mov     job_rax, [lane_data + _job_in_lane]
+	mov     unused_lanes, [state + _unused_lanes]
+	mov     qword [lane_data + _job_in_lane], 0
+	mov     dword [job_rax + _status], STS_COMPLETED
+	shl     unused_lanes, 4
+	or      unused_lanes, idx		; push lane idx back onto the free list
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub     dword [state + _num_lanes_inuse], 1
+
+	movd    xmm0, [state + _args_digest + 4*idx + 0*16]	; gather column idx of the digest matrix
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa  [job_rax + _result_digest + 0*16], xmm0	; aligned store into the completed job's digest
+	movdqa  [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa  xmm6,  [rsp + 16*0]
+	movdqa  xmm7,  [rsp + 16*1]
+	movdqa  xmm8,  [rsp + 16*2]
+	movdqa  xmm9,  [rsp + 16*3]
+	movdqa  xmm10, [rsp + 16*4]
+	movdqa  xmm11, [rsp + 16*5]
+	movdqa  xmm12, [rsp + 16*6]
+	movdqa  xmm13, [rsp + 16*7]
+	movdqa  xmm14, [rsp + 16*8]
+	movdqa  xmm15, [rsp + 16*9]
+	mov     rsi, [rsp + _XMM_SAVE + 8*3]
+	mov     rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+	mov     rbx, [rsp + _XMM_SAVE + 8*0]
+	mov     rbp, [rsp + _XMM_SAVE + 8*1]
+	mov     r12, [rsp + _XMM_SAVE + 8*2]
+	mov     r13, [rsp + _XMM_SAVE + 8*5]
+	mov     r14, [rsp + _XMM_SAVE + 8*6]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor     job_rax, job_rax	; no job completed: return 0
+	jmp     return
+
+section .data align=16
+; SHA-256 initial hash values H0..H7 (FIPS 180-4); no instruction in this file references them
+align 16
+H0:     dd  0x6a09e667
+H1:     dd  0xbb67ae85
+H2:     dd  0x3c6ef372
+H3:     dd  0xa54ff53a
+H4:     dd  0x510e527f
+H5:     dd  0x9b05688c
+H6:     dd  0x1f83d9ab
+H7:     dd  0x5be0cd19
+
+%else	; HAVE_AS_KNOWS_SHANI is not defined: the function above is not assembled
+ %ifidn __OUTPUT_FORMAT__, win64
+  global no_sha256_mb_mgr_submit_sse_ni	; win64 only: export a placeholder label (presumably so the object is not empty)
+  no_sha256_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
new file mode 100644
index 000000000..768bfca78
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS  10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long remaining;		// bytes still to fill; a non-positive size writes nothing
+	for (remaining = buffer_size; remaining > 0; remaining--)
+		*buf++ = rand();	// the int from rand() is narrowed to unsigned char on store
+}
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, fail = 0;	// fail = number of mismatching digest words
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	int ret;
+	// Compares sha256_mb digests with OpenSSL SHA256(); returns 0 on pass, non-zero on failure.
+	printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	srand(TEST_SEED);	// constant seed: the same rand() sequence on every run
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));	// 16-byte aligned manager
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha256_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents; user_data carries the pool index of this context
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Reference digest from OpenSSL, written as bytes into digest_ssl[i]
+		SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+
+		// Submit the same buffer to the multi-buffer manager in one HASH_ENTIRE call
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;	// keep flushing until it returns NULL
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {	// word-by-word; OpenSSL bytes go through to_be32()
+			if (ctxpool[i].job.result_digest[j] !=
+			    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       to_be32(((uint32_t *) digest_ssl[i])[j]));
+			}
+		}
+	}
+	putchar('.');	// progress mark; fail from this pass is only examined further down
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);	// 0 .. TEST_BUFS-1 jobs this round
+
+		sha256_ctx_mgr_init(mgr);	// re-initialize the manager for each round
+
+		for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents; len is 0 .. TEST_LEN-1
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Reference digest from OpenSSL for this length
+			SHA256(bufs[i], lens[i], digest_ssl[i]);
+
+			// Submit the same data to sha256_mb as one HASH_ENTIRE job
+			sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (sha256_ctx_mgr_flush(mgr)) ;	// keep flushing until it returns NULL
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] !=
+				    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+					fail++;
+					printf("Test%d, digest%d fail %08X <=> %08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       to_be32(((uint32_t *) digest_ssl[i])[j]));
+				}
+			}
+		}
+		if (fail) {	// also catches mismatches counted in the fixed-size pass above
+			printf("Test failed function check %d\n", fail);
+			return fail;
+		}
+
+		putchar('.');
+		fflush(0);	// null stream argument: flush all open output streams
+	}			// random test t
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_sha256_ssl rand: Pass\n");
+
+	return fail;	// NOTE(review): bufs[] and mgr are not freed before returning
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
new file mode 100644
index 000000000..adba77f3d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS  10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long remaining;		// bytes still to fill; a non-positive size writes nothing
+	for (remaining = buffer_size; remaining > 0; remaining--)
+		*buf++ = rand();	// the int from rand() is narrowed to unsigned char on store
+}
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;	// fail = number of mismatching digest words
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	uint8_t *tmp_buf;
+	int ret;
+	// Compares sha256_mb digests with sha256_ref(); returns 0 on pass, non-zero on failure.
+	printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));	// 16-byte aligned manager
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha256_ctx_mgr_init(mgr);
+
+	srand(TEST_SEED);	// constant seed: the same rand() sequence on every run
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate  and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents; user_data carries the pool index of this context
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+		// Submit the same buffer to sha256_mb as one HASH_ENTIRE job
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;	// keep flushing until it returns NULL
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%d fixed size, digest%d "
+				       "fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	if (fail) {
+		printf("Test failed function check %d\n", fail);
+		return fail;
+	}
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);	// 0 .. TEST_BUFS-1 jobs this round
+
+		sha256_ctx_mgr_init(mgr);	// re-initialize the manager for each round
+
+		for (i = 0; i < jobs; i++) {
+			// Use buffer with random len and contents; len is 0 .. TEST_LEN-1
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Run reference test
+			sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+			// Run sha256_mb test
+			sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (sha256_ctx_mgr_flush(mgr)) ;	// keep flushing until it returns NULL
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+					fail++;
+					printf("Test%d, digest%d fail "
+					       "0x%08X <=> 0x%08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       digest_ref[i][j]);
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %d\n", fail);
+			return fail;
+		}
+
+		putchar('.');
+		fflush(0);	// null stream argument: flush all open output streams
+	}			// random test t
+
+	// Test at the end of buffer
+	jobs = rand() % TEST_BUFS;	// may be 0, in which case the end test has nothing to run
+	tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+	if (!tmp_buf && jobs != 0) {	// malloc(0) may return NULL; that is not a failure
+		printf("malloc failed, end test aborted.\n");
+		return 1;
+	}
+
+	rand_buffer(tmp_buf, jobs);
+
+	sha256_ctx_mgr_init(mgr);
+
+	// Job i covers tmp_buf[i .. jobs-1], so every job ends on the last allocated byte
+	for (i = 0; i < jobs; i++) {
+		bufs[i] = (uint8_t *) & tmp_buf[i];	// NOTE(review): drops the only pointer to the earlier TEST_LEN buffer
+		lens[i] = jobs - i;
+
+		// Reference test
+		sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+		// Submit the same range to sha256_mb as one HASH_ENTIRE job
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;	// keep flushing until it returns NULL
+
+	for (i = 0; i < jobs; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("End test failed at offset %d - result: 0x%08X"
+				       ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	putchar('.');
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_sha256 rand: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
new file mode 100644
index 000000000..9535d80df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS  10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE		(13*SHA256_BLOCK_SIZE)	// bytes per fixed-size submit; parenthesized so it is safe inside larger expressions
+#define MAX_RAND_UPDATE_BLOCKS 	(TEST_LEN/(16*SHA256_BLOCK_SIZE))	// modulus applied to rand() when picking a block count
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long remaining;		// bytes still to fill; a non-positive size writes nothing
+	for (remaining = buffer_size; remaining > 0; remaining--)
+		*buf++ = rand();	// the int from rand() is narrowed to unsigned char on store
+}
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+	uint32_t i, j, fail = 0;	// fail = number of mismatching digest words
+	int len_done, len_rem, len_rand;
+	unsigned char *bufs[TEST_BUFS];
+	unsigned char *buf_ptr[TEST_BUFS];	// next unsubmitted byte of each job
+	uint32_t lens[TEST_BUFS];
+	unsigned int joblen, jobs, t;
+	int ret;
+	// Feeds each message to sha256_mb in several submits and compares with sha256_ref(); 0 on pass.
+	printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	srand(TEST_SEED);	// constant seed: the same rand() sequence on every run
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));	// 16-byte aligned manager
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha256_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		buf_ptr[i] = bufs[i];
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents; user_data carries the pool index of this context
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+	}
+
+	// Run sb_sha256 tests; i only moves on when submit returns NULL or a completed ctx
+	for (i = 0; i < TEST_BUFS;) {
+		len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_done == 0)
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+		else if (len_rem <= UPDATE_SIZE)
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Add jobs while available or finished
+		if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+			i++;
+			continue;
+		}
+		// Resubmit unfinished job: switch i to the job the manager handed back
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+	}
+
+	// Start flushing finished jobs, end on last flushed
+	ctx = sha256_ctx_mgr_flush(mgr);
+	while (ctx) {
+		if (hash_ctx_complete(ctx)) {
+			debug_char('-');
+			ctx = sha256_ctx_mgr_flush(mgr);
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+
+		len_done = (int)((unsigned long)buf_ptr[i]
+				 - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_rem <= UPDATE_SIZE)
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		if (ctx == NULL)
+			ctx = sha256_ctx_mgr_flush(mgr);
+	}
+
+	// Check digests
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+	putchar('.');	// progress mark; fail from this pass is only examined further down
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);	// 0 .. TEST_BUFS-1 jobs this round
+
+		for (i = 0; i < jobs; i++) {
+			joblen = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], joblen);
+			lens[i] = joblen;
+			buf_ptr[i] = bufs[i];
+			sha256_ref(bufs[i], digest_ref[i], lens[i]);
+		}
+
+		sha256_ctx_mgr_init(mgr);	// re-initialize the manager for each round
+
+		// Run sha256_sb jobs
+		i = 0;
+		while (i < jobs) {
+			// Submit a new job; first chunk is a whole number of blocks, at least one
+			len_rand = SHA256_BLOCK_SIZE +
+			    SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+			if (lens[i] > len_rand)
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], len_rand, HASH_FIRST);
+			else
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], lens[i], HASH_ENTIRE);
+
+			// Returned ctx could be:
+			//  - null context (we are just getting started and lanes aren't full yet), or
+			//  - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+			//  - an unfinished ctx, we will resubmit
+
+			if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+				i++;
+				continue;
+			} else {
+				// unfinished ctx returned, choose another random update length and submit either
+				// UPDATE or LAST depending on the amount of buffer remaining
+				while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+					j = (unsigned long)(ctx->user_data);	// Get index of the returned ctx
+					buf_ptr[j] = bufs[j] + ctx->total_length;
+					len_rand = (rand() % SHA256_BLOCK_SIZE)
+					    * (rand() % MAX_RAND_UPDATE_BLOCKS);
+					len_rem = lens[j] - ctx->total_length;
+
+					if (len_rem <= len_rand)	// submit the rest of the job as LAST
+						ctx = sha256_ctx_mgr_submit(mgr,
+									    &ctxpool[j],
+									    buf_ptr[j],
+									    len_rem,
+									    HASH_LAST);
+					else	// submit the random update length as UPDATE
+						ctx = sha256_ctx_mgr_submit(mgr,
+									    &ctxpool[j],
+									    buf_ptr[j],
+									    len_rand,
+									    HASH_UPDATE);
+				}	// Either continue submitting any contexts returned here as UPDATE/LAST, or
+				// go back to submitting new jobs using the index i.
+
+				i++;
+			}
+		}
+
+		// Start flushing finished jobs, end on last flushed
+		ctx = sha256_ctx_mgr_flush(mgr);
+		while (ctx) {
+			if (hash_ctx_complete(ctx)) {
+				debug_char('-');
+				ctx = sha256_ctx_mgr_flush(mgr);
+				continue;
+			}
+			// Resubmit unfinished job
+			i = (unsigned long)(ctx->user_data);
+			buf_ptr[i] = bufs[i] + ctx->total_length;	// update buffer pointer
+			len_rem = lens[i] - ctx->total_length;
+			len_rand = (rand() % SHA256_BLOCK_SIZE)
+			    * (rand() % MAX_RAND_UPDATE_BLOCKS);
+			debug_char('+');
+			if (len_rem <= len_rand)
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], len_rem, HASH_LAST);
+			else
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], len_rand, HASH_UPDATE);
+
+			if (ctx == NULL)
+				ctx = sha256_ctx_mgr_flush(mgr);
+		}
+
+		// Check result digest
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+					fail++;
+					printf("Test%d, digest%d fail %8X <=> %8X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       digest_ref[i][j]);
+				}
+			}
+		}
+		if (fail) {	// also catches mismatches counted in the fixed-size pass above
+			printf("Test failed function check %d\n", fail);
+			return fail;
+		}
+
+		putchar('.');
+		fflush(0);	// null stream argument: flush all open output streams
+	}			// random test t
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_sha256_update rand: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
new file mode 100644
index 000000000..8a5b5a9b2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha256_mb.h"
+
+typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS];
+
+#define MSGS 7			// number of known-answer messages in msgs[]
+#define NUM_JOBS 1000		// jobs submitted in the pseudo-random-order pass
+// Deterministic index in [0, MSGS) for an unsigned seed; fully parenthesized for use in expressions
+#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
+
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static uint8_t msg3[] =
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<";
+static uint8_t msg4[] =
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static uint8_t msg5[] =
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static uint8_t msg6[] =
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+    "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static uint8_t msg7[] = "";	// empty message (hashed with length 0 via strlen)
+// expResultDigestN is the digest main() expects for msgN, as eight 32-bit words
+static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039,
+	0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1
+};
+
+static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6,
+	0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD
+};
+
+static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619,
+	0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3
+};
+
+static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407,
+	0x8EC752A6, 0x084EC1F3, 0xBD873D79, 0x3FF78383
+};
+
+static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD,
+	0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA
+};
+
+static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F,
+	0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE
+};
+
+static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924,
+	0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+// Same order as msgs[]: expResultDigest[n] is compared with the result for msgs[n]
+static uint32_t *expResultDigest[MSGS] = {
+	expResultDigest1, expResultDigest2, expResultDigest3,
+	expResultDigest4, expResultDigest5, expResultDigest6,
+	expResultDigest7
+};
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+	uint32_t i, j, k, t, checked = 0;	// checked = contexts returned and verified
+	uint32_t *good;
+	int ret;
+	// Known-answer test of msgs[] vs expResultDigest[]: 0 on pass, 1 on alloc failure, -1 on test failure.
+	ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha256_ctx_mgr_init(mgr);
+
+	// Init contexts before first use
+	for (i = 0; i < MSGS; i++) {
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	for (i = 0; i < MSGS; i++) {
+		ctx = sha256_ctx_mgr_submit(mgr,
+					    &ctxpool[i],
+					    msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+		if (ctx) {	// a returned ctx may belong to an earlier submit; user_data says which
+			t = (unsigned long)(ctx->user_data);
+			good = expResultDigest[t];
+			checked++;
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the submit."
+				       " Error code: %d\n", ctx->error);
+				return -1;
+			}
+		}
+	}
+
+	while (1) {
+		ctx = sha256_ctx_mgr_flush(mgr);
+
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			good = expResultDigest[t];
+			checked++;
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the submit."
+				       " Error code: %d\n", ctx->error);
+				return -1;
+			}
+		} else {
+			break;
+		}
+	}
+	if (checked != MSGS) {	// validate the first pass before the counter is reset below
+		printf("only tested %d rather than %d\n", checked, MSGS);
+		return -1;
+	}
+
+	// do larger test in pseudo-random order
+
+	// Init contexts before first use
+	for (i = 0; i < NUM_JOBS; i++) {
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	checked = 0;
+	for (i = 0; i < NUM_JOBS; i++) {
+		j = PSEUDO_RANDOM_NUM(i);	// message index for job i
+		ctx = sha256_ctx_mgr_submit(mgr,
+					    &ctxpool[i],
+					    msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			k = PSEUDO_RANDOM_NUM(t);	// recompute which message job t was given
+			good = expResultDigest[k];
+			checked++;
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the"
+				       " submit. Error code: %d\n", ctx->error);
+				return -1;
+			}
+		}
+	}
+	while (1) {
+		ctx = sha256_ctx_mgr_flush(mgr);
+
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			k = PSEUDO_RANDOM_NUM(t);
+			good = expResultDigest[k];
+			checked++;
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the submit."
+				       " Error code: %d\n", ctx->error);
+				return -1;
+			}
+		} else {
+			break;
+		}
+	}
+
+	if (checked != NUM_JOBS) {
+		printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+		return -1;
+	}
+
+	printf(" multibinary_sha256 test: Pass\n");
+
+	return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..51759d7a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Number of buffers (and hash contexts) submitted per loop iteration
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+#  define TEST_LEN     (4*1024)
+#  define TEST_LOOPS   4000
+#  define TEST_TYPE_STR "_warm"
+#else
+// Uncached test.  Pull from large mem base.
+#  define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
+#  define TEST_LEN     (GT_L3_CACHE / TEST_BUFS)
+#  define TEST_LOOPS   20
+#  define TEST_TYPE_STR "_cold"
+#endif
+// Replacement lists are parenthesized so they expand safely inside larger expressions
+#define TEST_MEM ((size_t)TEST_LEN * TEST_BUFS * TEST_LOOPS)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, t, fail = 0;
+	struct perf start, stop;
+	/* Time OpenSSL SHA256() and the multi-buffer manager on the same buffers, then compare digests */
+	for (i = 0; i < TEST_BUFS; i++) {
+		bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+		if (bufs[i] == NULL) {
+			printf("calloc failed test aborted\n");
+			return 1;
+		}
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if (ret) {
+		printf("alloc error: Fail\n");
+		return -1;
+	}
+	sha256_ctx_mgr_init(mgr);
+
+	// Start OpenSSL tests (reference digests end up in digest_ssl[])
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+	}
+	perf_stop(&stop);
+
+	printf("sha256_openssl" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);	// i, t hold the final loop counts
+
+	// Start mb tests: submit every buffer, then flush until the manager is empty
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			sha256_ctx_mgr_submit(mgr,
+					      &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+		while (sha256_ctx_mgr_flush(mgr)) ;
+	}
+	perf_stop(&stop);
+
+	printf("multibinary_sha256" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+	// Functional check: every digest word must match the OpenSSL reference
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] !=
+			    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       to_be32(((uint32_t *) digest_ssl[i])[j]));
+			}
+		}
+	}
+
+	printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+	       "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_sha256_ossl_perf: Pass\n");
+	// Only the low 8 bits of the status reach the parent: a raw count of 256 would look like success
+	return fail ? 1 : 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..235ec74a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// One buffer (and hash context) per hardware lane; the test then shrinks the lane count
+#define TEST_BUFS SHA256_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+#  define TEST_LEN     (4*1024)
+#  define TEST_LOOPS   10000
+#  define TEST_TYPE_STR "_warm"
+#else
+// Uncached test.  Pull from large mem base.
+#  define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
+#  define TEST_LEN     (GT_L3_CACHE / TEST_BUFS)
+#  define TEST_LOOPS   100
+#  define TEST_TYPE_STR "_cold"
+#endif
+// Parenthesized replacement lists; size_t keeps the product from being evaluated in int
+#define TEST_MEM ((size_t)TEST_LEN * TEST_BUFS * TEST_LOOPS)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, t, fail = 0;
+	uint32_t nlanes;
+	struct perf start, stop;
+	/* Time OpenSSL once, then the multi-buffer manager with TEST_BUFS, TEST_BUFS-1, ... 1 jobs in flight */
+	for (i = 0; i < TEST_BUFS; i++) {
+		bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+		if (bufs[i] == NULL) {
+			printf("calloc failed test aborted\n");
+			return 1;
+		}
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if (ret) {
+		printf("alloc error: Fail\n");
+		return -1;
+	}
+	sha256_ctx_mgr_init(mgr);
+
+	// Start OpenSSL tests (reference digests end up in digest_ssl[])
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+	}
+	perf_stop(&stop);
+
+	printf("sha256_openssl" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	// Start mb shortage tests: each pass submits only the first nlanes buffers
+	for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+		perf_start(&start);
+		for (t = 0; t < TEST_LOOPS; t++) {
+			for (i = 0; i < nlanes; i++)
+				sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+						      HASH_ENTIRE);
+
+			while (sha256_ctx_mgr_flush(mgr)) ;
+		}
+		perf_stop(&stop);
+
+		printf("multibinary_sha256" TEST_TYPE_STR " with %d lanes: ", nlanes);
+		perf_print(stop, start, (long long)TEST_LEN * i * t);	// i == nlanes here
+		// Functional check of the nlanes digests produced by this pass
+		for (i = 0; i < nlanes; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] !=
+				    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+					fail++;
+					printf("Test%d, digest%d fail %08X <=> %08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       to_be32(((uint32_t *) digest_ssl[i])[j]));
+				}
+			}
+		}
+	}
+
+	printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+	       "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_sha256_ossl_shortage_perf: Pass\n");
+	// Only the low 8 bits of the status reach the parent: a count that is a multiple of 256 would look like success
+	return fail ? 1 : 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
new file mode 100644
index 000000000..f45669c6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
@@ -0,0 +1,930 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 16-lane (x16) SHA256 using AVX512
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers:  rax rbx     rdx rsi rdi        r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves:         rcx             rbp r8
+;;
+;; Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves:                       rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b	; token paste, e.g. APPEND(W,3) expands to W3
+
+; Define Stack Layout (offsets are applied to rsp after it has been aligned to 64 bytes)
+START_FIELDS
+;;;     name            size    align
+FIELD	_DIGEST_SAVE,	8*64,	64	; copy of A..H taken at the start of each block
+FIELD	_rsp,		8,	8	; rsp as it was on entry, stored before realignment
+%assign STACK_SPACE	_FIELD_OFFSET	; amount subtracted from rsp in the prologue
+
+%ifidn __OUTPUT_FORMAT__, win64
+   %define arg1 rcx	; arg0 preserved
+   %define arg2 rdx	; arg1
+   %define reg3 r8	; arg2 preserved
+   %define reg4 r9	; arg3
+   %define var1 rdi	; scratch, aliased as IDX below
+   %define var2 rsi	; scratch, aliased as TBL below
+   %define local_func_decl(func_name) global func_name
+ %else
+   %define arg1 rdi	; arg0
+   %define arg2 rsi	; arg1
+   %define var1 rdx	; arg2 register, used as scratch (IDX)
+   %define var2 rcx	; arg3 register, used as scratch (TBL)
+   %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state    arg1	; first argument: base address used for both DIGEST and IN
+%define num_blks arg2	; second argument: number of blocks to process
+
+%define	IN	(state + _data_ptr)	; table of 8-byte input pointers, read as [IN + n*8] for n = 0..15
+%define DIGEST	state			; eight 64-byte rows, read as [DIGEST + n*64]
+%define SIZE	num_blks		; decremented once per block in the main loop
+
+%define IDX  var1	; byte offset into every input buffer, advanced by 64 per block
+%define TBL  var2	; address of TABLE (round constants, one 64-byte entry per round)
+
+%define A	zmm0	; A..H: SHA-256 working variables, one zmm each
+%define B	zmm1
+%define C	zmm2
+%define D	zmm3
+%define E	zmm4
+%define F	zmm5
+%define G	zmm6
+%define H	zmm7
+%define T1	zmm8	; per-round T1 accumulator
+%define TMP0	zmm9	; TMP0..TMP6: scratch registers shared by the round and schedule macros
+%define TMP1	zmm10
+%define TMP2	zmm11
+%define TMP3	zmm12
+%define TMP4	zmm13
+%define TMP5	zmm14
+%define TMP6	zmm15
+
+%define W0	zmm16	; W0..W15: rolling window of 16 message-schedule words
+%define W1	zmm17
+%define W2	zmm18
+%define W3	zmm19
+%define W4	zmm20
+%define W5	zmm21
+%define W6	zmm22
+%define W7	zmm23
+%define W8	zmm24
+%define W9	zmm25
+%define W10	zmm26
+%define W11	zmm27
+%define W12	zmm28
+%define W13	zmm29
+%define W14	zmm30
+%define W15	zmm31
+
+%define inp0	r9	; inp0..inp7: general-purpose scratch for input pointers loaded from IN
+%define inp1	r10
+%define inp2	r11
+%define inp3	r12
+%define inp4	r13
+%define inp5	r14
+%define inp6	r15
+%define inp7	rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+; In-place transpose of the 16x16 dword matrix held in r0..r15; t0 and t1 are scratch and are clobbered. Input:
+; r0  = {a15 a14 a13 a12   a11 a10 a9 a8   a7 a6 a5 a4   a3 a2 a1 a0}
+; r1  = {b15 b14 b13 b12   b11 b10 b9 b8   b7 b6 b5 b4   b3 b2 b1 b0}
+; r2  = {c15 c14 c13 c12   c11 c10 c9 c8   c7 c6 c5 c4   c3 c2 c1 c0}
+; r3  = {d15 d14 d13 d12   d11 d10 d9 d8   d7 d6 d5 d4   d3 d2 d1 d0}
+; r4  = {e15 e14 e13 e12   e11 e10 e9 e8   e7 e6 e5 e4   e3 e2 e1 e0}
+; r5  = {f15 f14 f13 f12   f11 f10 f9 f8   f7 f6 f5 f4   f3 f2 f1 f0}
+; r6  = {g15 g14 g13 g12   g11 g10 g9 g8   g7 g6 g5 g4   g3 g2 g1 g0}
+; r7  = {h15 h14 h13 h12   h11 h10 h9 h8   h7 h6 h5 h4   h3 h2 h1 h0}
+; r8  = {i15 i14 i13 i12   i11 i10 i9 i8   i7 i6 i5 i4   i3 i2 i1 i0}
+; r9  = {j15 j14 j13 j12   j11 j10 j9 j8   j7 j6 j5 j4   j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12   k11 k10 k9 k8   k7 k6 k5 k4   k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12   l11 l10 l9 l8   l7 l6 l5 l4   l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12   m11 m10 m9 m8   m7 m6 m5 m4   m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12   n11 n10 n9 n8   n7 n6 n5 n4   n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12   o11 o10 o9 o8   o7 o6 o5 o4   o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12   p11 p10 p9 p8   p7 p6 p5 p4   p3 p2 p1 p0}
+; Output (rows and columns swapped):
+; r0   = {p0  o0  n0  m0    l0  k0  j0  i0    h0  g0  f0  e0    d0  c0  b0  a0}
+; r1   = {p1  o1  n1  m1    l1  k1  j1  i1    h1  g1  f1  e1    d1  c1  b1  a1}
+; r2   = {p2  o2  n2  m2    l2  k2  j2  i2    h2  g2  f2  e2    d2  c2  b2  a2}
+; r3   = {p3  o3  n3  m3    l3  k3  j3  i3    h3  g3  f3  e3    d3  c3  b3  a3}
+; r4   = {p4  o4  n4  m4    l4  k4  j4  i4    h4  g4  f4  e4    d4  c4  b4  a4}
+; r5   = {p5  o5  n5  m5    l5  k5  j5  i5    h5  g5  f5  e5    d5  c5  b5  a5}
+; r6   = {p6  o6  n6  m6    l6  k6  j6  i6    h6  g6  f6  e6    d6  c6  b6  a6}
+; r7   = {p7  o7  n7  m7    l7  k7  j7  i7    h7  g7  f7  e7    d7  c7  b7  a7}
+; r8   = {p8  o8  n8  m8    l8  k8  j8  i8    h8  g8  f8  e8    d8  c8  b8  a8}
+; r9   = {p9  o9  n9  m9    l9  k9  j9  i9    h9  g9  f9  e9    d9  c9  b9  a9}
+; r10  = {p10 o10 n10 m10   l10 k10 j10 i10   h10 g10 f10 e10   d10 c10 b10 a10}
+; r11  = {p11 o11 n11 m11   l11 k11 j11 i11   h11 g11 f11 e11   d11 c11 b11 a11}
+; r12  = {p12 o12 n12 m12   l12 k12 j12 i12   h12 g12 f12 e12   d12 c12 b12 a12}
+; r13  = {p13 o13 n13 m13   l13 k13 j13 i13   h13 g13 f13 e13   d13 c13 b13 a13}
+; r14  = {p14 o14 n14 m14   l14 k14 j14 i14   h14 g14 f14 e14   d14 c14 b14 a14}
+; r15  = {p15 o15 n15 m15   l15 k15 j15 i15   h15 g15 f15 e15   d15 c15 b15 a15}
+; Stage 1: vshufps transposes dwords inside each 128-bit lane, four rows at a time
+
+	; process top half (r0..r3) {a...d}
+	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b13 b12 a13 a12   b9  b8  a9  a8   b5 b4 a5 a4   b1 b0 a1 a0}
+	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b15 b14 a15 a14   b11 b10 a11 a10  b7 b6 a7 a6   b3 b2 a3 a2}
+	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d13 d12 c13 c12   d9  d8  c9  c8   d5 d4 c5 c4   d1 d0 c1 c0}
+	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d15 d14 c15 c14   d11 d10 c11 c10  d7 d6 c7 c6   d3 d2 c3 c2}
+
+	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d13 c13 b13 a13   d9  c9  b9  a9   d5 c5 b5 a5   d1 c1 b1 a1}
+	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d14 c14 b14 a14   d10 c10 b10 a10  d6 c6 b6 a6   d2 c2 b2 a2}
+	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d15 c15 b15 a15   d11 c11 b11 a11  d7 c7 b7 a7   d3 c3 b3 a3}
+	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d12 c12 b12 a12   d8  c8  b8  a8   d4 c4 b4 a4   d0 c0 b0 a0}
+
+	; use r2 in place of t0
+	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f13 f12 e13 e12   f9  f8  e9  e8   f5 f4 e5 e4   f1 f0 e1 e0}
+	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f15 f14 e15 e14   f11 f10 e11 e10  f7 f6 e7 e6   f3 f2 e3 e2}
+	vshufps %%t1, %%r6, %%r7, 0x44	; t1 = {h13 h12 g13 g12   h9  h8  g9  g8   h5 h4 g5 g4   h1 h0 g1 g0}
+	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h15 h14 g15 g14   h11 h10 g11 g10  h7 h6 g7 g6   h3 h2 g3 g2}
+
+	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h13 g13 f13 e13   h9  g9  f9  e9   h5 g5 f5 e5   h1 g1 f1 e1}
+	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h14 g14 f14 e14   h10 g10 f10 e10  h6 g6 f6 e6   h2 g2 f2 e2}
+	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h15 g15 f15 e15   h11 g11 f11 e11  h7 g7 f7 e7   h3 g3 f3 e3}
+	vshufps	%%r2, %%r2, %%t1, 0x88	; r2 = {h12 g12 f12 e12   h8  g8  f8  e8   h4 g4 f4 e4   h0 g0 f0 e0}
+
+	; use r6 in place of t0
+	vshufps	%%r6, %%r8, %%r9,    0x44	; r6  = {j13 j12 i13 i12   j9  j8  i9  i8   j5 j4 i5 i4   j1 j0 i1 i0}
+	vshufps	%%r8, %%r8, %%r9,    0xEE	; r8  = {j15 j14 i15 i14   j11 j10 i11 i10  j7 j6 i7 i6   j3 j2 i3 i2}
+	vshufps	%%t1, %%r10, %%r11,  0x44	; t1  = {l13 l12 k13 k12   l9  l8  k9  k8   l5 l4 k5 k4   l1 l0 k1 k0}
+	vshufps	%%r10, %%r10, %%r11, 0xEE	; r10 = {l15 l14 k15 k14   l11 l10 k11 k10  l7 l6 k7 k6   l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD		; r11 = {l13 k13 j13 i13   l9  k9  j9  i9   l5 k5 j5 i5   l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88		; r9  = {l14 k14 j14 i14   l10 k10 j10 i10  l6 k6 j6 i6   l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD		; r8  = {l15 k15 j15 i15   l11 k11 j11 i11  l7 k7 j7 i7   l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1,  0x88		; r6  = {l12 k12 j12 i12   l8  k8  j8  i8   l4 k4 j4 i4   l0 k0 j0 i0}
+
+	; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12   n9  n8  m9  m8   n5 n4 m5 m4   n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14   n11 n10 m11 m10  n7 n6 m7 m6   n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15,  0x44	; t1  = {p13 p12 o13 o12   p9  p8  o9  o8   p5 p4 o5 o4   p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14   p11 p10 o11 o10  p7 p6 o7 o6   p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1,  0xDD	; r15 = {p13 o13 n13 m13   p9  o9  n9  m9   p5 o5 n5 m5   p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14   p10 o10 n10 m10  p6 o6 n6 m6   p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15   p11 o11 n11 m11  p7 o7 n7 m7   p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1,  0x88	; r10 = {p12 o12 n12 m12   p8  o8  n8  m8   p4 o4 n4 m4   p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+;; Stage 2: vpermi2q overwrites its index register (loaded with MASK1 or MASK2) with 128-bit groups picked from two sources
+	vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r14, %%t0, %%r2		; r14 = {h8  g8  f8  e8   d8  c8  b8  a8   h0 g0 f0 e0	 d0 c0 b0 a0}
+	vmovdqa32 %%t1,  [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%t1,  %%t0, %%r2		; t1  = {h12 g12 f12 e12  d12 c12 b12 a12  h4 g4 f4 e4	 d4 c4 b4 a4}
+
+	vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r2, %%r3, %%r7		; r2  = {h9  g9  f9  e9   d9  c9  b9  a9   h1 g1 f1 e1	 d1 c1 b1 a1}
+	vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%t0, %%r3, %%r7		; t0  = {h13 g13 f13 e13  d13 c13 b13 a13  h5 g5 f5 e5	 d5 c5 b5 a5}
+
+	vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r3, %%r1, %%r5		; r3  = {h10 g10 f10 e10  d10 c10 b10 a10  h2 g2 f2 e2	 d2 c2 b2 a2}
+	vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%r7, %%r1, %%r5		; r7  = {h14 g14 f14 e14  d14 c14 b14 a14  h6 g6 f6 e6	 d6 c6 b6 a6}
+
+	vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r1, %%r0, %%r4		; r1  = {h11 g11 f11 e11  d11 c11 b11 a11  h3 g3 f3 e3	 d3 c3 b3 a3}
+	vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%r5, %%r0, %%r4		; r5  = {h15 g15 f15 e15  d15 c15 b15 a15  h7 g7 f7 e7	 d7 c7 b7 a7}
+
+	vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r0, %%r6, %%r10		; r0  = {p8  o8  n8  m8   l8  k8  j8  i8   p0 o0 n0 m0	 l0 k0 j0 i0}
+	vmovdqa32 %%r4,  [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%r4, %%r6, %%r10		; r4  = {p12 o12 n12 m12  l12 k12 j12 i12  p4 o4 n4 m4	 l4 k4 j4 i4}
+
+	vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r6, %%r11, %%r15		; r6  = {p9  o9  n9  m9   l9  k9  j9  i9   p1 o1 n1 m1	 l1 k1 j1 i1}
+	vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%r10, %%r11, %%r15		; r10 = {p13 o13 n13 m13  l13 k13 j13 i13  p5 o5 n5 m5	 l5 k5 j5 i5}
+
+	vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r11, %%r9, %%r13		; r11 = {p10 o10 n10 m10  l10 k10 j10 i10  p2 o2 n2 m2	 l2 k2 j2 i2}
+	vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%r15, %%r9, %%r13		; r15 = {p14 o14 n14 m14  l14 k14 j14 i14  p6 o6 n6 m6	 l6 k6 j6 i6}
+
+	vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+	vpermi2q  %%r9, %%r8, %%r12		; r9  = {p11 o11 n11 m11  l11 k11 j11 i11  p3 o3 n3 m3	 l3 k3 j3 i3}
+	vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+	vpermi2q  %%r13, %%r8, %%r12		; r13 = {p15 o15 n15 m15  l15 k15 j15 i15  p7 o7 n7 m7	 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+;; Stage 3: vshuff64x2 joins the a..h and i..p halves; 0x44 takes the low 256 bits of both sources, 0xEE the high 256 bits
+	vshuff64x2 %%r8, %%r14, %%r0, 0xEE 	; r8  = {p8  o8  n8  m8   l8  k8  j8  i8   h8 g8 f8 e8   d8 c8 b8 a8}
+	vshuff64x2 %%r0, %%r14, %%r0, 0x44 	; r0  = {p0  o0  n0  m0   l0  k0  j0  i0   h0 g0 f0 e0   d0 c0 b0 a0}
+
+	vshuff64x2 %%r12, %%t1, %%r4, 0xEE 	; r12 = {p12 o12 n12 m12  l12 k12 j12 i12  h12 g12 f12 e12  d12 c12 b12 a12}
+	vshuff64x2 %%r4, %%t1, %%r4, 0x44 	; r4  = {p4  o4  n4  m4   l4  k4  j4  i4   h4 g4 f4 e4   d4 c4 b4 a4}
+
+	vshuff64x2 %%r14, %%r7, %%r15, 0xEE 	; r14 = {p14 o14 n14 m14  l14 k14 j14 i14  h14 g14 f14 e14  d14 c14 b14 a14}
+	vshuff64x2 %%t1, %%r7, %%r15, 0x44 	; t1  = {p6  o6  n6  m6   l6  k6  j6  i6   h6 g6 f6 e6   d6 c6 b6 a6}
+
+	vshuff64x2 %%r15, %%r5, %%r13, 0xEE 	; r15 = {p15 o15 n15 m15  l15 k15 j15 i15  h15 g15 f15 e15  d15 c15 b15 a15}
+	vshuff64x2 %%r7, %%r5, %%r13, 0x44 	; r7  = {p7  o7  n7  m7   l7  k7  j7  i7   h7 g7 f7 e7   d7 c7 b7 a7}
+
+	vshuff64x2 %%r13, %%t0, %%r10, 0xEE 	; r13 = {p13 o13 n13 m13  l13 k13 j13 i13  h13 g13 f13 e13  d13 c13 b13 a13}
+	vshuff64x2 %%r5, %%t0, %%r10, 0x44 	; r5  = {p5  o5  n5  m5   l5  k5  j5  i5   h5 g5 f5 e5   d5 c5 b5 a5}
+
+	vshuff64x2 %%r10, %%r3, %%r11, 0xEE 	; r10 = {p10 o10 n10 m10  l10 k10 j10 i10  h10 g10 f10 e10  d10 c10 b10 a10}
+	vshuff64x2 %%t0, %%r3, %%r11, 0x44 	; t0  = {p2  o2  n2  m2   l2  k2  j2  i2   h2 g2 f2 e2   d2 c2 b2 a2}
+
+	vshuff64x2 %%r11, %%r1, %%r9, 0xEE 	; r11 = {p11 o11 n11 m11  l11 k11 j11 i11  h11 g11 f11 e11  d11 c11 b11 a11}
+	vshuff64x2 %%r3, %%r1, %%r9, 0x44 	; r3  = {p3  o3  n3  m3   l3  k3  j3  i3   h3 g3 f3 e3   d3 c3 b3 a3}
+
+	vshuff64x2 %%r9, %%r2, %%r6, 0xEE 	; r9  = {p9  o9  n9  m9   l9  k9  j9  i9   h9 g9 f9 e9   d9 c9 b9 a9}
+	vshuff64x2 %%r1, %%r2, %%r6, 0x44 	; r1  = {p1  o1  n1  m1   l1  k1  j1  i1   h1 g1 f1 e1   d1 c1 b1 a1}
+	; rows 2 and 6 were assembled in the temporaries; move them to their final registers
+	vmovdqa32 %%r2, %%t0			; r2  = {p2  o2  n2  m2   l2  k2  j2  i2   h2 g2 f2 e2   d2 c2 b2 a2}
+	vmovdqa32 %%r6, %%t1			; r6  = {p6  o6  n6  m6   l6  k6  j6  i6   h6 g6 f6 e6   d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0		; shift the names A..H by one position; no data is moved
+%xdefine TMP_ H			; %xdefine expands now, so TMP_ captures the register currently named H
+%xdefine H G			; each name takes over the register of its predecessor
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_			; A now names the old H register
+%endm
+
+;;  CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_2  ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6  ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7  ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2		; one SHA-256 round; expects Kt of this round in TMP3
+%define %%WT	%1		; zmm register holding the message word Wt
+%define %%ROUND	%2		; round number 0..63, must be an assemble-time constant
+	;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+	;; T2 = SIGMA0(A) + MAJ(A, B, C)
+	;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+	;; Clobbers T1 and TMP0-TMP3; TMP3 holds Kt of the next round on exit (not after round 63)
+	;; H becomes T2, then add T1 for A
+	;; D becomes D + T1 for E
+
+	vpaddd		T1, H, TMP3		; T1 = H + Kt
+	vmovdqa32	TMP0, E
+	vprord		TMP1, E, 6 		; ROR_6(E)
+	vprord		TMP2, E, 11 		; ROR_11(E)
+	vprord		TMP3, E, 25 		; ROR_25(E)
+	vpternlogd	TMP0, F, G, 0xCA	; TMP0 = CH(E,F,G)
+	vpaddd		T1, T1, %%WT		; T1 = T1 + Wt
+	vpternlogd	TMP1, TMP2, TMP3, 0x96	; TMP1 = SIGMA1(E)
+	vpaddd		T1, T1, TMP0		; T1 = T1 + CH(E,F,G)
+	vpaddd		T1, T1, TMP1		; T1 = T1 + SIGMA1(E)
+	vpaddd		D, D, T1		; D = D + T1
+
+	vprord		H, A, 2 		; ROR_2(A)
+	vprord		TMP2, A, 13 		; ROR_13(A)
+	vprord		TMP3, A, 22 		; ROR_22(A)
+	vmovdqa32	TMP0, A
+	vpternlogd	TMP0, B, C, 0xE8	; TMP0 = MAJ(A,B,C)
+	vpternlogd	H, TMP2, TMP3, 0x96	; H(T2) = SIGMA0(A)
+	vpaddd		H, H, TMP0		; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+	vpaddd		H, H, T1		; H(A) = H(T2) + T1
+%if (%%ROUND < 63)
+	vmovdqa32	TMP3, [TBL + ((%%ROUND+1)*64)]	; Next Kt; skipped after round 63, where it would read past K[63]
+%endif
+	;; Rotate the args A-H (rotation of names associated with regs)
+	ROTATE_ARGS
+%endmacro
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord is only working on port 8
+;
+; Main processing loop per round
+; Get the msg schedule word 16 from the current, now unnecessary word
+%macro PROCESS_LOOP_00_47 5	; one round fused with the schedule update for round t+16
+%define %%W_CUR		%1	; in: W[t]; out: W[t+16] (overwritten in place)
+%define %%RND		%2	; round number t, assemble-time constant
+%define %%W_NEXT1	%3	; W[t+1]
+%define %%W_NEXT9	%4	; W[t+9]
+%define %%W_NEXT14	%5	; W[t+14]
+	;; Round:     T1 = H + SIGMA1(E) + CH(E,F,G) + K[t] + W[t]
+	;;            T2 = SIGMA0(A) + MAJ(A,B,C)
+	;;            D += T1 (becomes E), H = T1 + T2 (becomes A), then the names rotate
+	;; Schedule:  W[t+16] = sigma1(W[t+14]) + W[t+9] + sigma0(W[t+1]) + W[t]
+	;; Entry:     K[t] must already be in TMP6 (PROCESS_LOOP uses TMP3 for this instead)
+	;; Exit:      TMP6 = K[t+1]; T1 and TMP0-TMP5 are clobbered
+	;; NOTE(review): no invocation of this macro is visible in this part of the file;
+	;; the round loop uses PROCESS_LOOP + MSG_SCHED_ROUND_16_63 - confirm before relying on it
+	;; Round and schedule instructions are interleaved; keep the order as is
+	vmovdqa32	TMP0, E			; TMP0 = E, first input of CH
+	vprord		TMP1, E, 6 		; E ror 6
+	vprord		TMP2, E, 11 		; E ror 11
+	vprord		TMP3, E, 25 		; E ror 25
+	vpternlogd	TMP0, F, G, 0xCA	; CH: take F where E is set, else G
+	vpaddd		T1, H, %%W_CUR		; T1 = H + W[t]
+	vpternlogd	TMP1, TMP2, TMP3, 0x96	; 3-way XOR -> SIGMA1(E)
+	vpaddd		T1, T1, TMP6		; T1 += K[t]
+	vprord		H, A, 2 		; H is free now: start T2 with A ror 2
+	vpaddd		T1, T1, TMP0		; T1 += CH(E,F,G)
+	vprord		TMP2, A, 13 		; A ror 13
+	vmovdqa32	TMP0, A			; TMP0 = A, first input of MAJ
+	vprord		TMP3, A, 22 		; A ror 22
+	vpaddd		T1, T1, TMP1		; T1 += SIGMA1(E), T1 complete
+	vpternlogd	TMP0, B, C, 0xE8	; bitwise majority of A, B, C
+	vpaddd		D, D, T1		; new E
+	vpternlogd	H, TMP2, TMP3, 0x96	; 3-way XOR -> SIGMA0(A)
+	vprord		TMP4, %%W_NEXT14, 17 	; W[t+14] ror 17
+	vpaddd		H, H, TMP0		; T2 = SIGMA0(A) + MAJ(A,B,C)
+	vprord		TMP5, %%W_NEXT14, 19 	; W[t+14] ror 19
+	vpsrld		TMP6, %%W_NEXT14, 10 	; W[t+14] shr 10 (K[t] in TMP6 is no longer needed)
+	vpaddd		H, H, T1		; new A = T1 + T2
+	vpternlogd	TMP4, TMP5, TMP6, 0x96	; 3-way XOR -> sigma1(W[t+14])
+	vpaddd		%%W_CUR, %%W_CUR, TMP4	; W[t] + sigma1(W[t+14])
+	vprord		TMP4, %%W_NEXT1, 7 	; W[t+1] ror 7
+	vprord		TMP5, %%W_NEXT1, 18 	; W[t+1] ror 18
+	vpaddd		%%W_CUR, %%W_CUR, %%W_NEXT9	; ... + W[t+9]
+	vpsrld		TMP6, %%W_NEXT1, 3 	; W[t+1] shr 3
+	vpternlogd	TMP4, TMP5, TMP6, 0x96	; 3-way XOR -> sigma0(W[t+1])
+	vpaddd		%%W_CUR, %%W_CUR, TMP4	; ... + sigma0(W[t+1])
+						; %%W_CUR now holds W[t+16]
+
+	vmovdqa32	TMP6, [TBL + ((%%RND+1)*64)]	; fetch K[t+1] for the following round
+
+	;; A..H are renamed rather than moved
+	ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4	; W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+%define %%W_T16	%1		; in: W[t-16]; out: W[t] (updated in place)
+%define %%W_T15	%2		; W[t-15], read only
+%define %%W_T7	%3		; W[t-7], read only
+%define %%W_T2	%4		; W[t-2], read only
+	vprord		TMP4, %%W_T2, 17 	; W[t-2] ror 17
+	vprord		TMP5, %%W_T2, 19 	; W[t-2] ror 19
+	vpsrld		TMP6, %%W_T2, 10 	; W[t-2] shr 10
+	vpternlogd	TMP4, TMP5, TMP6, 0x96	; 3-way XOR -> sigma1(W[t-2])
+	; accumulate into the slot that still holds W[t-16]
+	vpaddd		%%W_T16, %%W_T16, TMP4	; W[t-16] + sigma1(W[t-2])
+	vpaddd		%%W_T16, %%W_T16, %%W_T7	; ... + W[t-7]
+	; TMP4-TMP6 are reused for the second small sigma
+	vprord		TMP4, %%W_T15, 7 	; W[t-15] ror 7
+	vprord		TMP5, %%W_T15, 18 	; W[t-15] ror 18
+	vpsrld		TMP6, %%W_T15, 3 	; W[t-15] shr 3
+	vpternlogd	TMP4, TMP5, TMP6, 0x96	; 3-way XOR -> sigma0(W[t-15])
+	; the last term completes W[t]
+	vpaddd		%%W_T16, %%W_T16, TMP4	; ... + sigma0(W[t-15])
+						; scratch clobbered: TMP4, TMP5, TMP6
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2	; load 64 input bytes for one entry of the pointer table
+%define %%DST_W	%1		; zmm register that receives the 64 bytes, exactly as stored in memory
+%define %%SLOT	%2		; index of the 8-byte pointer to use within the table at IN
+	mov		inp0, [IN + (%%SLOT*8)]	; inp0 = input pointer number %%SLOT (inp0 is clobbered)
+	vmovups		%%DST_W, [inp0+IDX]	; unaligned load at byte offset IDX from that pointer
+%endmacro
+
+align 64
+
+;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha256_mb_x16_avx512)
+sha256_mb_x16_avx512:
+	endbranch
+	mov	rax, rsp
+        sub     rsp, STACK_SPACE
+	and	rsp, ~63	; align stack to multiple of 64
+	mov	[rsp + _rsp], rax
+	lea	TBL, [TABLE]
+
+	;; Initialize digests
+	vmovups	A, [DIGEST + 0*64]
+	vmovups	B, [DIGEST + 1*64]
+	vmovups	C, [DIGEST + 2*64]
+	vmovups	D, [DIGEST + 3*64]
+	vmovups	E, [DIGEST + 4*64]
+	vmovups	F, [DIGEST + 5*64]
+	vmovups	G, [DIGEST + 6*64]
+	vmovups	H, [DIGEST + 7*64]
+
+	; Do we need to transpose digests???
+	; SHA1 does not, but SHA256 has been
+
+	xor IDX, IDX
+
+	;; Read in first block of input data
+	;; Transpose input data
+	mov	inp0, [IN + 0*8]
+	mov	inp1, [IN + 1*8]
+	mov	inp2, [IN + 2*8]
+	mov	inp3, [IN + 3*8]
+	mov	inp4, [IN + 4*8]
+	mov	inp5, [IN + 5*8]
+	mov	inp6, [IN + 6*8]
+	mov	inp7, [IN + 7*8]
+
+	vmovups	W0,[inp0+IDX]
+	vmovups	W1,[inp1+IDX]
+	vmovups	W2,[inp2+IDX]
+	vmovups	W3,[inp3+IDX]
+	vmovups	W4,[inp4+IDX]
+	vmovups	W5,[inp5+IDX]
+	vmovups	W6,[inp6+IDX]
+	vmovups	W7,[inp7+IDX]
+
+	mov	inp0, [IN + 8*8]
+	mov	inp1, [IN + 9*8]
+	mov	inp2, [IN +10*8]
+	mov	inp3, [IN +11*8]
+	mov	inp4, [IN +12*8]
+	mov	inp5, [IN +13*8]
+	mov	inp6, [IN +14*8]
+	mov	inp7, [IN +15*8]
+
+	vmovups	W8, [inp0+IDX]
+	vmovups	W9, [inp1+IDX]
+	vmovups	W10,[inp2+IDX]
+	vmovups	W11,[inp3+IDX]
+	vmovups	W12,[inp4+IDX]
+	vmovups	W13,[inp5+IDX]
+	vmovups	W14,[inp6+IDX]
+	vmovups	W15,[inp7+IDX]
+
+
+lloop:
+	vmovdqa32	TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+	vmovdqa32	TMP3, [TBL]	; First K
+
+	; Save digests for later addition
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*0], A
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*1], B
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*2], C
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*3], D
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*4], E
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*5], F
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*6], G
+        vmovdqa32	[rsp + _DIGEST_SAVE + 64*7], H
+
+	add	IDX, 64
+
+	TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+       	vpshufb	APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+	; MSG Schedule for W0-W15 is now complete in registers
+	; Process first 48 rounds
+	; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+	; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+	PROCESS_LOOP  APPEND(W,J),  I
+	MSG_SCHED_ROUND_16_63  APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+	; Check if this is the last block
+	sub 	SIZE, 1
+	je	lastLoop
+
+	; Process last 16 rounds
+	; Read in next block msg data for use in first 16 words of msg sched
+%assign I 48
+%assign J 0
+%rep 16
+	PROCESS_LOOP  APPEND(W,J), I
+	MSG_SCHED_ROUND_00_15  APPEND(W,J), J
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+	; Add old digest
+        vpaddd		A, A, [rsp + _DIGEST_SAVE + 64*0]
+        vpaddd		B, B, [rsp + _DIGEST_SAVE + 64*1]
+        vpaddd		C, C, [rsp + _DIGEST_SAVE + 64*2]
+        vpaddd		D, D, [rsp + _DIGEST_SAVE + 64*3]
+        vpaddd		E, E, [rsp + _DIGEST_SAVE + 64*4]
+        vpaddd		F, F, [rsp + _DIGEST_SAVE + 64*5]
+        vpaddd		G, G, [rsp + _DIGEST_SAVE + 64*6]
+        vpaddd		H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+	jmp	lloop
+
+lastLoop:
+	; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+	PROCESS_LOOP  APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+	; Add old digest
+        vpaddd		A, A, [rsp + _DIGEST_SAVE + 64*0]
+        vpaddd		B, B, [rsp + _DIGEST_SAVE + 64*1]
+        vpaddd		C, C, [rsp + _DIGEST_SAVE + 64*2]
+        vpaddd		D, D, [rsp + _DIGEST_SAVE + 64*3]
+        vpaddd		E, E, [rsp + _DIGEST_SAVE + 64*4]
+        vpaddd		F, F, [rsp + _DIGEST_SAVE + 64*5]
+        vpaddd		G, G, [rsp + _DIGEST_SAVE + 64*6]
+        vpaddd		H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+        ;; update input data pointers
+%assign I 0
+%rep 8
+        mov    inp0, [IN + (2*I)*8]
+        mov    inp1, [IN + (2*I +1)*8]
+        add    inp0, IDX
+        add    inp1, IDX
+        mov    [IN + (2*I)*8], inp0
+        mov    [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+	; Write out digest
+	; Do we need to untranspose digests???
+	vmovups	[DIGEST + 0*64], A
+	vmovups	[DIGEST + 1*64], B
+	vmovups	[DIGEST + 2*64], C
+	vmovups	[DIGEST + 3*64], D
+	vmovups	[DIGEST + 4*64], E
+	vmovups	[DIGEST + 5*64], F
+	vmovups	[DIGEST + 6*64], G
+	vmovups	[DIGEST + 7*64], H
+
+
+        mov     rsp, [rsp + _rsp]
+        ret
+
+        section .data
+align 64
+TABLE:	; 64 SHA-256 round constants; each 32-bit value is repeated 16 times, giving one 64-byte row per constant
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b	; vpshufb control: byte order 3,2,1,0 within every dword (endianness swap of the message words)
+			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b	; same 16-byte pattern repeated, 64 bytes in total
+			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: 	dq 0x0000000000000000	; eight qword indices: 0, 1, 8, 9, 4, 5, 12, 13
+				dq 0x0000000000000001	; presumably a two-source qword permute selector used by TRANSPOSE16 (macro not visible here) - TODO confirm
+				dq 0x0000000000000008
+				dq 0x0000000000000009
+				dq 0x0000000000000004
+				dq 0x0000000000000005
+				dq 0x000000000000000C
+				dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: 	dq 0x0000000000000002	; eight qword indices: 2, 3, 10, 11, 6, 7, 14, 15
+				dq 0x0000000000000003	; presumably the companion selector to PSHUFFLE_TRANSPOSE16_MASK1 inside TRANSPOSE16 - TODO confirm
+				dq 0x000000000000000A
+				dq 0x000000000000000B
+				dq 0x0000000000000006
+				dq 0x0000000000000007
+				dq 0x000000000000000E
+				dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_x16_avx512	; win64 only: placeholder symbol exported from the branch where HAVE_AS_KNOWS_AVX512 is not satisfied
+no_sha256_mb_x16_avx512:	; label with no code behind it; presumably keeps the win64 object from being empty - TODO confirm
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
new file mode 100644
index 000000000..7f8f8829b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
@@ -0,0 +1,431 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using AVX
+;; Logic designed/laid out by JDG
+
+; TRANSPOSE in0, in1, in2, in3, lo, hi
+; 4x4 dword transpose of four xmm rows using the two scratch registers lo/hi.
+; Written for six distinct registers.
+; On entry:
+;   in0 = {a3 a2 a1 a0}    in1 = {b3 b2 b1 b0}
+;   in2 = {c3 c2 c1 c0}    in3 = {d3 d2 d1 d0}
+; On exit the four columns are, in order, in {lo in1 in0 in3}:
+;   lo  = {d0 c0 b0 a0}
+;   in1 = {d1 c1 b1 a1}
+;   in0 = {d2 c2 b2 a2}
+;   in3 = {d3 c3 b3 a3}
+; in2 and hi are left holding intermediate values.
+;
+%macro TRANSPOSE 6
+%define %%in0 %1
+%define %%in1 %2
+%define %%in2 %3
+%define %%in3 %4
+%define %%lo  %5
+%define %%hi  %6
+	; step 1: split rows 0/1 and rows 2/3 into low and high dword pairs
+	vshufps	%%lo,  %%in0, %%in1, 0x44	; lo  = {b1 b0 a1 a0}
+	vshufps	%%hi,  %%in2, %%in3, 0x44	; hi  = {d1 d0 c1 c0}
+	vshufps	%%in0, %%in0, %%in1, 0xEE	; in0 = {b3 b2 a3 a2}
+	vshufps	%%in2, %%in2, %%in3, 0xEE	; in2 = {d3 d2 c3 c2}
+
+	; step 2: odd dwords of each pair form columns 1 and 3,
+	;         even dwords form columns 0 and 2
+	vshufps	%%in1, %%lo,  %%hi,  0xDD	; in1 = {d1 c1 b1 a1}
+	vshufps	%%lo,  %%lo,  %%hi,  0x88	; lo  = {d0 c0 b0 a0}
+
+	vshufps	%%in3, %%in0, %%in2, 0xDD	; in3 = {d3 c3 b3 a3}
+	vshufps	%%in0, %%in0, %%in2, 0x88	; in0 = {d2 c2 b2 a2}
+%endmacro
+; TABLE: symbol name given to the round-constant table; SZ: bytes per 32-bit word;
+; SZ4: bytes per 4-lane row (one xmm register); ROUNDS: byte size of 64 such rows, the end value compared against ROUND
+%define TABLE	K256_4_MB
+%define SZ	4
+%define SZ4	4*SZ
+%define ROUNDS 64*SZ4
+; a..h: the eight working-state registers, four dword lanes each; ROTATE_ARGS renames them after every round
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+; a0..a2: scratch registers used inside the round macros
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+; TT0..TT5: registers for loading/transposing message words; TT4 and TT5 reuse xmm10/xmm9 (a2/a1)
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+; T1 reuses xmm14 (TT0) and carries the schedule word in rounds 16 and up; TMP is the scratch of the short PRORD forms
+%define T1  xmm14
+%define TMP xmm15
+; ROTATE_ARGS: assembly-time rotation of the state names (new a = old h, new b = old a, ..., new h = old g).
+; No instructions are emitted; code assembled afterwards simply refers to the registers under their rotated names.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp : rotate every dword of reg right by imm bits, in place; tmp is clobbered
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	vpslld	%%tmp, %%reg, (32-(%%imm))	; tmp = reg << (32 - imm): the bits that wrap around
+	vpsrld	%%reg, %%reg, %%imm	; reg = reg >> imm (logical)
+	vpor	%%reg, %%reg, %%tmp	; merge both parts into the rotated value
+%endmacro
+
+; non-destructive rotate: src is only read
+; PRORD_nd reg, imm, tmp, src : reg = every dword of src rotated right by imm bits; tmp is clobbered
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+	vpslld	%%tmp, %%src, (32-(%%imm))	; tmp = src << (32 - imm): the bits that wrap around
+	vpsrld	%%reg, %%src, %%imm	; reg = src >> imm (logical)
+	vpor	%%reg, %%reg, %%tmp	; merge both parts into the rotated value
+%endmacro
+
+; PRORD dst/src, amt : two-operand overload of PRORD that uses TMP as the scratch register
+%macro PRORD 2
+	PRORD	%1, %2, TMP	; forwards to the three-operand form
+%endmacro
+
+; PRORD_nd dst, src, amt : three-operand overload of PRORD_nd (src comes before amt here); uses TMP as the scratch register
+%macro PRORD_nd 3
+	PRORD_nd	%1, %3, TMP, %2	; reorders the operands for the four-operand form
+%endmacro
+
+;; ROUND_00_15 T1, i : one SHA-256 round on all four lanes. T1 = register holding message word W[i] (clobbered), i = round index; the state is passed implicitly in the symbols a..h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i  %2
+	; Also clobbers a0, a1, a2 and TMP, stores W[i] in the 16-slot ring buffer at rsp and advances ROUND by one table row
+
+	PRORD_nd	a0, e, (11-6)	; sig1: a0 = e ror 5
+
+	vpxor	a2, f, g	; ch: a2 = f^g
+	vpand	a2, e		; ch: a2 = (f^g)&e
+	vpxor	a2, g		; a2 = ch
+
+	PRORD_nd	a1, e, 25		; sig1: a1 = e ror 25
+	vmovdqa	[SZ4*(%%i&0xf) + rsp], %%T1	; keep W[i] in ring slot i mod 16 for the message schedule
+	vpaddd	%%T1, %%T1, [TBL + ROUND]	; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e ror 5)
+	PRORD	a0, 6		; sig1: a0 = (e ror 6) ^ (e ror 11)
+	vpaddd	h, h, a2	; h = h + ch
+	PRORD_nd	a2, a, (13-2)	; sig0: a2 = a ror 11
+	vpaddd	h, h, %%T1	; h = h + ch + W + K
+	vpxor	a0, a0, a1	; a0 = sigma1
+	PRORD_nd	a1, a, 22	; sig0: a1 = a ror 22
+	vpxor	%%T1, a, c	; maj: T1 = a^c
+	add	ROUND, SZ4	; advance ROUND to the next 16-byte table row
+	vpand	%%T1, %%T1, b	; maj: T1 = (a^c)&b
+	vpaddd	h, h, a0	; h = h + ch + W + K + sigma1
+
+	vpaddd	d, d, h	; d = d + h (this register is named e after ROTATE_ARGS)
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a ror 11)
+	PRORD	a2, 2		; sig0: a2 = (a ror 2) ^ (a ror 13)
+	vpxor	a2, a2, a1	; a2 = sig0
+	vpand	a1, a, c	; maj: a1 = a&c
+	vpor	a1, a1, %%T1	; a1 = maj
+	vpaddd	h, h, a1	; h = h + ch + W + K + sigma1 + maj
+	vpaddd	h, h, a2	; h = h + ch + W + K + sigma1 + maj + sigma0 (named a after ROTATE_ARGS)
+
+	ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i : build schedule word W[i] in T1 from the 16-slot ring buffer at rsp, then run ROUND_00_15 on it; state is passed implicitly in a..h; clobbers a0, a1, a2, TMP
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i  %2
+
+	vmovdqa	%%T1, [SZ4*((%%i-15)&0xf) + rsp]	; T1 = W[i-15]
+	vmovdqa	a1, [SZ4*((%%i-2)&0xf) + rsp]	; a1 = W[i-2]
+	vmovdqa	a0, %%T1	; a0 = W[i-15]
+	PRORD	%%T1, 18-7	; T1 = W[i-15] ror 11
+	vmovdqa	a2, a1	; a2 = W[i-2]
+	PRORD	a1, 19-17	; a1 = W[i-2] ror 2
+	vpxor	%%T1, %%T1, a0	; T1 = W[i-15] ^ (W[i-15] ror 11)
+	PRORD	%%T1, 7	; T1 = (W[i-15] ror 7) ^ (W[i-15] ror 18)
+	vpxor	a1, a1, a2	; a1 = W[i-2] ^ (W[i-2] ror 2)
+	PRORD	a1, 17	; a1 = (W[i-2] ror 17) ^ (W[i-2] ror 19)
+	vpsrld	a0, a0, 3	; a0 = W[i-15] >> 3
+	vpxor	%%T1, %%T1, a0	; T1 = small sigma0 of W[i-15]
+	vpsrld	a2, a2, 10	; a2 = W[i-2] >> 10
+	vpxor	a1, a1, a2	; a1 = small sigma1 of W[i-2]
+	vpaddd	%%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]	; T1 += W[i-16]
+	vpaddd	a1, a1, [SZ4*((%%i-7)&0xf) + rsp]	; a1 += W[i-7]
+	vpaddd	%%T1, %%T1, a1	; T1 = W[i]
+
+	ROUND_00_15 %%T1, %%i	; stores W[i] over the slot of W[i-16] and performs the round
+%endm
+; Stack frame: bytes [0, DATA) from rsp hold the 16-row ring buffer of message words; the DIGEST_SIZE bytes at _DIGEST hold the saved digest
+%define DIGEST_SIZE	8*SZ4
+%define DATA	       16*SZ4
+%define ALIGNMENT       1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8 (nothing is pushed here), so rsp becomes 16-byte aligned when it is 8 mod 16 on entry
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+; VMOVPS: load used for message data; the unaligned form, so the input buffers need no particular alignment
+%define VMOVPS	vmovups
+; inp0..inp3: input data pointers of lanes 0..3
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions (System V AMD64: first two integer arguments in rdi, rsi)
+ %define arg1 	rdi
+ %define arg2	rsi
+%else
+ ; Windows definitions (rcx, rdx) - NOTE(review): every non-elf64 output format takes this branch
+ %define arg1 	rcx
+ %define arg2 	rdx
+%endif
+
+; Common definitions: IDX = byte offset into each lane's input, ROUND = byte offset into the constant table, TBL = table base (rbx and r12 are not saved by this file)
+%define IDX     rax
+%define ROUND	rbx
+%define TBL	r12
+
+;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used); digest rows of SZ4 bytes start at offset 0 (aligned access), lane pointers at _data_ptr
+;; arg 2 : arg2 : size of data in blocks (assumed >= 1; the count is only tested after a block has been processed)
+;; Hashes len 64-byte blocks from each of the 4 lanes, then writes back the digest rows and the advanced data pointers.
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;; NOTE: rbx and r12 (and xmm6-xmm15 under the Windows convention) are callee-saved but are not saved here; callers must preserve them.
+mk_global sha256_mb_x4_avx, function, internal
+align 32
+sha256_mb_x4_avx:
+	endbranch
+	sub	rsp, FRAMESZ	; reserve the W ring buffer and the digest save area
+
+	;; Initialize digests
+	vmovdqa	a,[arg1+0*SZ4]
+	vmovdqa	b,[arg1+1*SZ4]
+	vmovdqa	c,[arg1+2*SZ4]
+	vmovdqa	d,[arg1+3*SZ4]
+	vmovdqa	e,[arg1+4*SZ4]
+	vmovdqa	f,[arg1+5*SZ4]
+	vmovdqa	g,[arg1+6*SZ4]
+	vmovdqa	h,[arg1+7*SZ4]
+
+	lea	TBL,[TABLE]	; TBL = base address of the round-constant table
+
+	;; fetch the four lane data pointers; the input is transposed onto the stack block by block inside lloop
+	mov	inp0,[arg1 + _data_ptr + 0*8]
+	mov	inp1,[arg1 + _data_ptr + 1*8]
+	mov	inp2,[arg1 + _data_ptr + 2*8]
+	mov	inp3,[arg1 + _data_ptr + 3*8]
+
+	xor	IDX, IDX	; IDX = byte offset of the current block within every lane
+lloop:	; one iteration per 64-byte block
+	xor	ROUND, ROUND	; ROUND = byte offset of the current row in TBL
+
+	;; save old digest
+	vmovdqa	[rsp + _DIGEST + 0*SZ4], a
+	vmovdqa	[rsp + _DIGEST + 1*SZ4], b
+	vmovdqa	[rsp + _DIGEST + 2*SZ4], c
+	vmovdqa	[rsp + _DIGEST + 3*SZ4], d
+	vmovdqa	[rsp + _DIGEST + 4*SZ4], e
+	vmovdqa	[rsp + _DIGEST + 5*SZ4], f
+	vmovdqa	[rsp + _DIGEST + 6*SZ4], g
+	vmovdqa	[rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+	vmovdqa	TMP, [PSHUFFLE_BYTE_FLIP_MASK]	; reloaded on every pass: the rounds below use TMP as rotate scratch
+	VMOVPS	TT2,[inp0+IDX+i*16]	; 16 bytes (4 message words) of lane 0
+	VMOVPS	TT1,[inp1+IDX+i*16]	; ... of lane 1
+	VMOVPS	TT4,[inp2+IDX+i*16]	; ... of lane 2
+	VMOVPS	TT3,[inp3+IDX+i*16]	; ... of lane 3
+	TRANSPOSE	TT2, TT1, TT4, TT3, TT0, TT5	; TT0..TT3 = message words 4i..4i+3, one lane per dword
+	vpshufb	TT0, TT0, TMP	; reverse the bytes of every dword
+	vpshufb	TT1, TT1, TMP
+	vpshufb	TT2, TT2, TMP
+	vpshufb	TT3, TT3, TMP
+	ROUND_00_15	TT0,(i*4+0)
+	ROUND_00_15	TT1,(i*4+1)
+	ROUND_00_15	TT2,(i*4+2)
+	ROUND_00_15	TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+	add	IDX, 4*4*4	; advance by one 64-byte block
+
+	; i is 4 after the loop above; it is rescaled to round index 16 for the ROUND_16_XX expansions
+%assign i (i*4)
+
+	jmp	Lrounds_16_xx	; jump over the alignment padding that follows
+align 16
+Lrounds_16_xx:	; runs 3 times, 16 unrolled rounds per pass (rounds 16..63)
+%rep 16
+	ROUND_16_XX	T1, i	; the ring-buffer slots depend only on i mod 16, so the same 16 expansions serve every pass
+%assign i (i+1)
+%endrep
+
+	cmp	ROUND,ROUNDS	; all 64 table rows consumed?
+	jb	Lrounds_16_xx
+
+	;; add old digest
+	vpaddd	a, a, [rsp + _DIGEST + 0*SZ4]
+	vpaddd	b, b, [rsp + _DIGEST + 1*SZ4]
+	vpaddd	c, c, [rsp + _DIGEST + 2*SZ4]
+	vpaddd	d, d, [rsp + _DIGEST + 3*SZ4]
+	vpaddd	e, e, [rsp + _DIGEST + 4*SZ4]
+	vpaddd	f, f, [rsp + _DIGEST + 5*SZ4]
+	vpaddd	g, g, [rsp + _DIGEST + 6*SZ4]
+	vpaddd	h, h, [rsp + _DIGEST + 7*SZ4]
+
+
+	sub	arg2, 1	; one more block done
+	jne	lloop	; continue while blocks remain
+
+	; write digests out
+	vmovdqa	[arg1+0*SZ4],a
+	vmovdqa	[arg1+1*SZ4],b
+	vmovdqa	[arg1+2*SZ4],c
+	vmovdqa	[arg1+3*SZ4],d
+	vmovdqa	[arg1+4*SZ4],e
+	vmovdqa	[arg1+5*SZ4],f
+	vmovdqa	[arg1+6*SZ4],g
+	vmovdqa	[arg1+7*SZ4],h
+
+	; update input pointers (IDX now equals the number of bytes consumed from each lane)
+	add	inp0, IDX
+	mov	[arg1 + _data_ptr + 0*8], inp0
+	add	inp1, IDX
+	mov	[arg1 + _data_ptr + 1*8], inp1
+	add	inp2, IDX
+	mov	[arg1 + _data_ptr + 2*8], inp2
+	add	inp3, IDX
+	mov	[arg1 + _data_ptr + 3*8], inp3
+
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+
+	add	rsp, FRAMESZ	; release the frame reserved at entry
+	ret
+
+section .data align=64
+; Round-constant table (TABLE is defined above as the name K256_4_MB): 64 rows of 16 bytes, each a 32-bit SHA-256 constant repeated in all four dwords
+align 64
+TABLE:
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b	; vpshufb control reversing the bytes of each dword; follows the 1024-byte 64-aligned table, so it is 16-byte aligned
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
new file mode 100644
index 000000000..2d349abbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
@@ -0,0 +1,426 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using SSE
+;; Logic designed/laid out by JDG
+
+; TRANSPOSE r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+; r2 and t1 are left holding intermediate values (scratch on exit)
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+	movaps	%%t0, %%r0		; t0 = {a3 a2 a1 a0}
+	shufps	%%t0, %%r1, 0x44	; t0 = {b1 b0 a1 a0}
+	shufps	%%r0, %%r1, 0xEE	; r0 = {b3 b2 a3 a2}
+
+	movaps	%%t1, %%r2		; t1 = {c3 c2 c1 c0}
+	shufps	%%t1, %%r3, 0x44	; t1 = {d1 d0 c1 c0}
+	shufps	%%r2, %%r3, 0xEE	; r2 = {d3 d2 c3 c2}
+
+	movaps	%%r1, %%t0		; r1 = {b1 b0 a1 a0}
+	shufps	%%r1, %%t1, 0xDD	; r1 = {d1 c1 b1 a1}
+
+	movaps	%%r3, %%r0		; r3 = {b3 b2 a3 a2}
+	shufps	%%r3, %%r2, 0xDD	; r3 = {d3 c3 b3 a3}
+
+	shufps	%%r0, %%r2, 0x88	; r0 = {d2 c2 b2 a2}
+	shufps	%%t0, %%t1, 0x88	; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE	K256_4_MB	; round-constant table symbol (TABLE expands to K256_4_MB wherever it is used)
+%define SZ	4		; bytes per 32-bit word
+%define SZ4	4*SZ		; bytes per xmm vector (4 lanes)
+%define ROUNDS 64*SZ4		; upper limit for ROUND: 64 rounds, SZ4 bytes of K per round
+; working variables a..h: one xmm register each (one dword per lane)
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+; scratch registers used inside the round macros
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+; message-load temporaries; TT4/TT5 share registers with a2/a1, and TT0 with T1
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+; T1 is the message-word register handed to ROUND_16_XX; TMP is the default PRORD scratch
+%define T1  xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0	; rename only, no data moves: a now names the register that was h, b the one that was a, ..., h the one that was g
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp: rotate every dword of reg right by imm bits (tmp is overwritten)
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	movdqa	%%tmp, %%reg		; tmp = reg
+	psrld	%%reg, %%imm		; reg = reg >> imm
+	pslld	%%tmp, (32-(%%imm))	; tmp = reg << (32 - imm)
+	por	%%reg, %%tmp		; reg = rotated value
+%endmacro
+; PRORD reg, imm: two-operand form of the above that uses TMP as the scratch register
+%macro PRORD 2
+	PRORD	%1, %2, TMP
+%endmacro
+
+;; ROUND_00_15 T1, i: one SHA-256 round for 4 lanes; T1 holds W[i] (clobbered), state is implicit in symbols a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i  %2
+	;; also uses a0-a2 and TMP as scratch, stores W[i] at rsp, reads K at [TBL + ROUND] and advances ROUND
+
+	movdqa	a0, e		; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+	PRORD	a0, (11-6)	; sig1: a0 = (e ror 5)
+
+	movdqa	a2, f		; ch: a2 = f
+	pxor	a2, g		; ch: a2 = f^g
+	pand	a2, e		; ch: a2 = (f^g)&e
+	pxor	a2, g		; a2 = ch
+
+	PRORD	a1, 25		; sig1: a1 = (e ror 25)
+	movdqa	[SZ4*(%%i&0xf) + rsp],%%T1	; save W[i] in slot (i mod 16) of the on-stack ring
+	paddd	%%T1,[TBL + ROUND]	; T1 = W + K
+	pxor	a0, e		; sig1: a0 = e ^ (e ror 5)
+	PRORD	a0, 6		; sig1: a0 = (e ror 6) ^ (e ror 11)
+	paddd	h, a2		; h = h + ch
+	movdqa	a2, a		; sig0: a2 = a
+	PRORD	a2, (13-2)	; sig0: a2 = (a ror 11)
+	paddd	h, %%T1		; h = h + ch + W + K
+	pxor	a0, a1		; a0 = sigma1
+	movdqa	a1, a		; sig0: a1 = a
+	movdqa	%%T1, a		; maj: T1 = a
+	PRORD	a1, 22		; sig0: a1 = (a ror 22)
+	pxor	%%T1, c		; maj: T1 = a^c
+	add	ROUND, SZ4	; ROUND += SZ4: step to the next K row
+	pand	%%T1, b		; maj: T1 = (a^c)&b
+	paddd	h, a0		; h = h + ch + W + K + sigma1
+
+	paddd	d, h		; d = d + h (this register is named e after ROTATE_ARGS)
+
+	pxor	a2, a		; sig0: a2 = a ^ (a ror 11)
+	PRORD	a2, 2		; sig0: a2 = (a ror 2) ^ (a ror 13)
+	pxor	a2, a1		; a2 = sig0
+	movdqa	a1, a		; maj: a1 = a
+	pand	a1, c		; maj: a1 = a&c
+	por	a1, %%T1	; a1 = maj
+	paddd	h, a1		; h = h + ch + W + K + sigma1 + maj
+	paddd	h, a2		; h = h + ch + W + K + sigma1 + maj + sigma0
+
+	ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i: build W[i] in T1 from the 16-entry ring at rsp, then run ROUND_00_15 on it
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i  %2
+	;; W[i] = s0(W[i-15]) + W[i-16] + s1(W[i-2]) + W[i-7]; ring slots are indexed mod 16
+	movdqa	%%T1, [SZ4*((%%i-15)&0xf) + rsp]	; T1 = W[i-15]
+	movdqa	a1, [SZ4*((%%i-2)&0xf) + rsp]	; a1 = W[i-2]
+	movdqa	a0, %%T1		; a0 = W[i-15]
+	PRORD	%%T1, 18-7		; T1 = W[i-15] ror 11
+	movdqa	a2, a1			; a2 = W[i-2]
+	PRORD	a1, 19-17		; a1 = W[i-2] ror 2
+	pxor	%%T1, a0		; T1 = W[i-15] ^ (W[i-15] ror 11)
+	PRORD	%%T1, 7			; T1 = (W[i-15] ror 7) ^ (W[i-15] ror 18)
+	pxor	a1, a2			; a1 = W[i-2] ^ (W[i-2] ror 2)
+	PRORD	a1, 17			; a1 = (W[i-2] ror 17) ^ (W[i-2] ror 19)
+	psrld	a0, 3			; a0 = W[i-15] >> 3
+	pxor	%%T1, a0		; T1 = s0(W[i-15])
+	psrld	a2, 10			; a2 = W[i-2] >> 10
+	pxor	a1, a2			; a1 = s1(W[i-2])
+	paddd	%%T1, [SZ4*((%%i-16)&0xf) + rsp]	; T1 += W[i-16]
+	paddd	a1, [SZ4*((%%i-7)&0xf) + rsp]	; a1 += W[i-7]
+	paddd	%%T1, a1		; T1 = W[i]
+
+	ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE	8*SZ4
+%define DATA	       16*SZ4
+%define ALIGNMENT       1*8
+; ALIGNMENT makes FRAMESZ an odd multiple of 8 (no registers are pushed), so rsp is 16-byte aligned after the sub if it was 8 mod 16 at entry
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+; frame layout: [rsp, rsp+DATA) is the 16-entry W ring, [rsp+_DIGEST, +DIGEST_SIZE) holds the digest saved per block
+%define MOVPS	movups
+; per-lane input data pointers
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 	rdi
+ %define arg2	rsi
+%else
+ ; Windows definitions
+ %define arg1 	rcx
+ %define arg2 	rdx
+%endif
+
+; Common definitions
+%define IDX     rax
+%define ROUND	rbx
+%define TBL	r12
+
+;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 :  pointer args (only 4 of the 8 lanes used)
+;; arg 2 :  size of data in blocks (assumed >= 1)
+;; Updates in place: the 8 digest rows at [args] and the 4 data pointers at [args + _data_ptr]
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;; NOTE(review): rbx, r12 (and xmm6-15 on Win64) are callee-saved in the C ABIs; presumably the caller preserves them - confirm
+
+mk_global sha256_mb_x4_sse, function, internal
+align 32
+sha256_mb_x4_sse:
+	endbranch
+	sub	rsp, FRAMESZ		; reserve the W ring and the saved-digest area
+
+	;; Initialize digests: row k (one digest word for 4 lanes) is read from [arg1 + k*SZ4]
+	movdqa	a,[arg1+0*SZ4]
+	movdqa	b,[arg1+1*SZ4]
+	movdqa	c,[arg1+2*SZ4]
+	movdqa	d,[arg1+3*SZ4]
+	movdqa	e,[arg1+4*SZ4]
+	movdqa	f,[arg1+5*SZ4]
+	movdqa	g,[arg1+6*SZ4]
+	movdqa	h,[arg1+7*SZ4]
+	;; NOTE(review): row stride SZ4 and 16-byte alignment of args are assumed here; confirm against sha256_mb_mgr_datastruct.asm
+	lea	TBL,[TABLE]		; TBL = address of the round-constant table
+
+	;; load the four lane data pointers (each block is transposed into registers below)
+	mov	inp0,[arg1 + _data_ptr + 0*8]
+	mov	inp1,[arg1 + _data_ptr + 1*8]
+	mov	inp2,[arg1 + _data_ptr + 2*8]
+	mov	inp3,[arg1 + _data_ptr + 3*8]
+
+	xor	IDX, IDX		; IDX = bytes consumed so far from every lane
+lloop:
+	xor	ROUND, ROUND		; ROUND = byte offset into the K table
+
+	;; save old digest
+	movdqa	[rsp + _DIGEST + 0*SZ4], a
+	movdqa	[rsp + _DIGEST + 1*SZ4], b
+	movdqa	[rsp + _DIGEST + 2*SZ4], c
+	movdqa	[rsp + _DIGEST + 3*SZ4], d
+	movdqa	[rsp + _DIGEST + 4*SZ4], e
+	movdqa	[rsp + _DIGEST + 5*SZ4], f
+	movdqa	[rsp + _DIGEST + 6*SZ4], g
+	movdqa	[rsp + _DIGEST + 7*SZ4], h
+	;; rounds 0..15: each of the 4 passes loads 16 bytes per lane, transposes, byte-swaps and runs 4 rounds
+%assign i 0
+%rep 4
+	movdqa	TMP, [PSHUFFLE_BYTE_FLIP_MASK]	; mask reverses the bytes of every dword; reloaded each pass because PRORD overwrites TMP
+	MOVPS	TT2,[inp0+IDX+i*16]	; lane 0
+	MOVPS	TT1,[inp1+IDX+i*16]	; lane 1
+	MOVPS	TT4,[inp2+IDX+i*16]	; lane 2
+	MOVPS	TT3,[inp3+IDX+i*16]	; lane 3
+	TRANSPOSE	TT2, TT1, TT4, TT3, TT0, TT5	; TT0..TT3 = words 4i..4i+3, one lane per dword
+	pshufb	TT0, TMP
+	pshufb	TT1, TMP
+	pshufb	TT2, TMP
+	pshufb	TT3, TMP
+	ROUND_00_15	TT0,(i*4+0)
+	ROUND_00_15	TT1,(i*4+1)
+	ROUND_00_15	TT2,(i*4+2)
+	ROUND_00_15	TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+	add	IDX, 4*4*4		; consumed one 64-byte block per lane
+
+	;; i is 4 after the rep block; rescale it so the schedule rounds start at i = 16
+%assign i (i*4)
+
+	jmp	Lrounds_16_xx		; hop over any padding emitted by the align
+align 16
+Lrounds_16_xx:
+%rep 16
+	ROUND_16_XX	T1, i
+%assign i (i+1)
+%endrep
+
+	cmp	ROUND,ROUNDS		; 16 more rounds per pass until all 64 K rows are used
+	jb	Lrounds_16_xx
+
+	;; add old digest
+	paddd	a, [rsp + _DIGEST + 0*SZ4]
+	paddd	b, [rsp + _DIGEST + 1*SZ4]
+	paddd	c, [rsp + _DIGEST + 2*SZ4]
+	paddd	d, [rsp + _DIGEST + 3*SZ4]
+	paddd	e, [rsp + _DIGEST + 4*SZ4]
+	paddd	f, [rsp + _DIGEST + 5*SZ4]
+	paddd	g, [rsp + _DIGEST + 6*SZ4]
+	paddd	h, [rsp + _DIGEST + 7*SZ4]
+
+
+	sub	arg2, 1			; one block finished on every lane
+	jne	lloop
+
+	; write digests out
+	movdqa	[arg1+0*SZ4],a
+	movdqa	[arg1+1*SZ4],b
+	movdqa	[arg1+2*SZ4],c
+	movdqa	[arg1+3*SZ4],d
+	movdqa	[arg1+4*SZ4],e
+	movdqa	[arg1+5*SZ4],f
+	movdqa	[arg1+6*SZ4],g
+	movdqa	[arg1+7*SZ4],h
+
+	; advance each lane's data pointer by the bytes consumed (IDX) and store it back
+	add	inp0, IDX
+	mov	[arg1 + _data_ptr + 0*8], inp0
+	add	inp1, IDX
+	mov	[arg1 + _data_ptr + 1*8], inp1
+	add	inp2, IDX
+	mov	[arg1 + _data_ptr + 2*8], inp2
+	add	inp3, IDX
+	mov	[arg1 + _data_ptr + 3*8], inp3
+
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+
+	add	rsp, FRAMESZ		; release the frame reserved at entry
+	ret
+
+section .data align=64
+
+align 64
+TABLE:
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
new file mode 100644
index 000000000..dbd9db1b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
@@ -0,0 +1,620 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA256 using SSE-256 / AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers:  rax rbx     rdx rsi rdi        r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves:         rcx             rbp r8
+;;
+;; Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves:                       rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions (reg3/reg4 are extra scratch registers, not arguments)
+     %define arg1 	rdi
+     %define arg2	rsi
+     %define reg3	rcx
+     %define reg4	rdx
+%else
+ ; Windows definitions (reg3/reg4 are extra scratch registers, not arguments)
+     %define arg1 	rcx
+     %define arg2 	rdx
+     %define reg3	rsi
+     %define reg4	rdi
+%endif
+
+; Common definitions
+%define STATE    arg1
+%define INP_SIZE arg2
+; IDX: byte offset into each lane's data; ROUND: byte offset into the K table; TBL: K table base
+%define IDX     rax
+%define ROUND	rbx
+%define TBL	reg3
+; per-lane input data pointers
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+; ymm0	a
+; ymm1	b
+; ymm2	c
+; ymm3	d
+; ymm4	e
+; ymm5	f
+; ymm6	g	TMP0
+; ymm7	h	TMP1
+; ymm8	T1	TT0
+; ymm9		TT1
+; ymm10		TT2
+; ymm11		TT3
+; ymm12	a0	TT4
+; ymm13	a1	TT5
+; ymm14	a2	TT6
+; ymm15	TMP	TT7
+; working variables a..h: one ymm register each (one dword per lane)
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define T1  ymm8
+; scratch registers used inside the round macros
+%define a0 ymm12
+%define a1 ymm13
+%define a2 ymm14
+%define TMP ymm15
+; TRANSPOSE8 temporaries; they share registers with g/h
+%define TMP0 ymm6
+%define TMP1 ymm7
+; message-load registers; TT0 shares a register with T1, TT4..TT7 with a0, a1, a2 and TMP
+%define TT0 ymm8
+%define TT1 ymm9
+%define TT2 ymm10
+%define TT3 ymm11
+%define TT4 ymm12
+%define TT5 ymm13
+%define TT6 ymm14
+%define TT7 ymm15
+; NOTE: SZ8 is defined before SHA256_DIGEST_WORD_SIZE; this works because define bodies are expanded at the point of use
+%define SZ8	8*SHA256_DIGEST_WORD_SIZE	; Size of one vector register
+%define ROUNDS	64*SZ8	; upper limit for ROUND: 64 rounds, SZ8 bytes of K per round
+%define PTR_SZ                  8
+%define SHA256_DIGEST_WORD_SIZE	4
+%define MAX_SHA256_LANES	8
+%define NUM_SHA256_DIGEST_WORDS	8
+%define SHA256_DIGEST_ROW_SIZE	(MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; The prologue rounds rsp down to a 32-byte boundary itself and keeps the
+;; caller's rsp in .rsp, so FRAMESZ needs no particular residue mod 32
+struc stack_frame
+  .data		resb	16*SZ8	; 16-entry ring of message-schedule vectors, addressed as [SZ8*(i&0xf) + rsp]
+  .digest	resb	8*SZ8	; digest a..h saved at the start of each block
+  .ytmp		resb	4*SZ8	; four spill slots (YTMP0..YTMP3)
+  .rsp		resb	8	; caller's rsp, restored in the postamble
+endstruc
+%define FRAMESZ	stack_frame_size
+%define _DIGEST	stack_frame.digest
+%define _YTMP	stack_frame.ytmp
+%define _RSP_SAVE	stack_frame.rsp
+
+%define YTMP0	rsp + _YTMP + 0*SZ8
+%define YTMP1	rsp + _YTMP + 1*SZ8
+%define YTMP2	rsp + _YTMP + 2*SZ8
+%define YTMP3	rsp + _YTMP + 3*SZ8
+
+%define VMOVPS	vmovups
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
+; t0 and t1 are overwritten (scratch on exit)
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+	; process top half (r0..r3) {a...d}
+	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
+	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
+	vshufps %%t1, %%r2, %%r3, 0x44	; t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
+	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
+	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
+	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
+	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
+	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d4 c4 b4 a4   d0 c0 b0 a0}
+
+	; use r2 in place of t0
+	; process bottom half (r4..r7) {e...h}
+	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
+	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
+	vshufps %%t1, %%r6, %%r7, 0x44	; t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
+	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
+	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
+	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
+	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
+	vshufps	%%t1, %%r2, %%t1, 0x88	; t1 = {h4 g4 f4 e4   h0 g0 f0 e0}
+
+	vperm2f128	%%r6, %%r5, %%r1, 0x13	; h6...a6
+	vperm2f128	%%r2, %%r5, %%r1, 0x02	; h2...a2
+	vperm2f128	%%r5, %%r7, %%r3, 0x13	; h5...a5
+	vperm2f128	%%r1, %%r7, %%r3, 0x02	; h1...a1
+	vperm2f128	%%r7, %%r4, %%r0, 0x13	; h7...a7
+	vperm2f128	%%r3, %%r4, %%r0, 0x02	; h3...a3
+	vperm2f128	%%r4, %%t1, %%t0, 0x13	; h4...a4
+	vperm2f128	%%r0, %%t1, %%t0, 0x02	; h0...a0
+%endmacro
+
+
+
+%macro ROTATE_ARGS 0	; rename only, no data moves: a now names the register that was h, b the one that was a, ..., h the one that was g
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp: rotate every dword of reg right by imm bits, in place (tmp is overwritten)
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	vpslld	%%tmp, %%reg, (32-(%%imm))	; tmp = reg << (32 - imm)
+	vpsrld	%%reg, %%reg, %%imm		; reg = reg >> imm
+	vpor	%%reg, %%reg, %%tmp		; reg = rotated value
+%endmacro
+
+; non-destructive: reg = src rotated right by imm bits, src is left unchanged (tmp is overwritten)
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+	vpslld	%%tmp, %%src, (32-(%%imm))	; tmp = src << (32 - imm)
+	vpsrld	%%reg, %%src, %%imm		; reg = src >> imm
+	vpor	%%reg, %%reg, %%tmp		; reg = rotated value
+%endmacro
+
+; PRORD dst/src, amt: in-place rotate using TMP as the scratch register
+%macro PRORD 2
+	PRORD	%1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt: non-destructive rotate using TMP as the scratch register (note the argument reordering)
+%macro PRORD_nd 3
+	PRORD_nd	%1, %3, TMP, %2
+%endmacro
+
+;; ROUND_00_15 T1, i: one SHA-256 round for 8 lanes; T1 holds W[i] (clobbered), state is implicit in symbols a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i  %2
+	PRORD_nd	a0, e, (11-6)	; sig1: a0 = (e ror 5)
+
+	vpxor	a2, f, g	; ch: a2 = f^g
+	vpand	a2, a2, e		; ch: a2 = (f^g)&e
+	vpxor	a2, a2, g		; a2 = ch
+
+	PRORD_nd	a1, e, 25		; sig1: a1 = (e ror 25)
+	vmovdqa	[SZ8*(%%i&0xf) + rsp], %%T1	; save W[i] in slot (i mod 16) of the on-stack ring
+	vpaddd	%%T1, %%T1, [TBL + ROUND]	; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e ror 5)
+	PRORD	a0, 6		; sig1: a0 = (e ror 6) ^ (e ror 11)
+	vpaddd	h, h, a2	; h = h + ch
+	PRORD_nd	a2, a, (13-2)	; sig0: a2 = (a ror 11)
+	vpaddd	h, h, %%T1	; h = h + ch + W + K
+	vpxor	a0, a0, a1	; a0 = sigma1
+	PRORD_nd	a1, a, 22	; sig0: a1 = (a ror 22)
+	vpxor	%%T1, a, c	; maj: T1 = a^c
+	add	ROUND, SZ8	; ROUND += SZ8: step to the next K row
+	vpand	%%T1, %%T1, b	; maj: T1 = (a^c)&b
+	vpaddd	h, h, a0	; h = h + ch + W + K + sigma1
+
+	vpaddd	d, d, h		; d = d + h (this register is named e after ROTATE_ARGS)
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a ror 11)
+	PRORD	a2, 2		; sig0: a2 = (a ror 2) ^ (a ror 13)
+	vpxor	a2, a2, a1	; a2 = sig0
+	vpand	a1, a, c	; maj: a1 = a&c
+	vpor	a1, a1, %%T1	; a1 = maj
+	vpaddd	h, h, a1	; h = h + ch + W + K + sigma1 + maj
+	vpaddd	h, h, a2	; h = h + ch + W + K + sigma1 + maj + sigma0
+
+	ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i: build W[i] = s0(W[i-15]) + W[i-16] + s1(W[i-2]) + W[i-7] from the ring at rsp, then run ROUND_00_15
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i  %2
+	vmovdqa	%%T1, [SZ8*((%%i-15)&0xf) + rsp]	; T1 = W[i-15]
+	vmovdqa	a1, [SZ8*((%%i-2)&0xf) + rsp]	; a1 = W[i-2]
+	vmovdqa	a0, %%T1		; a0 = W[i-15]
+	PRORD	%%T1, 18-7		; T1 = W[i-15] ror 11
+	vmovdqa	a2, a1			; a2 = W[i-2]
+	PRORD	a1, 19-17		; a1 = W[i-2] ror 2
+	vpxor	%%T1, %%T1, a0		; T1 = W[i-15] ^ (W[i-15] ror 11)
+	PRORD	%%T1, 7			; T1 = (W[i-15] ror 7) ^ (W[i-15] ror 18)
+	vpxor	a1, a1, a2		; a1 = W[i-2] ^ (W[i-2] ror 2)
+	PRORD	a1, 17			; a1 = (W[i-2] ror 17) ^ (W[i-2] ror 19)
+	vpsrld	a0, a0, 3		; a0 = W[i-15] >> 3
+	vpxor	%%T1, %%T1, a0		; T1 = s0(W[i-15])
+	vpsrld	a2, a2, 10		; a2 = W[i-2] >> 10
+	vpxor	a1, a1, a2		; a1 = s1(W[i-2])
+	vpaddd	%%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp]	; T1 += W[i-16]
+	vpaddd	a1, a1, [SZ8*((%%i-7)&0xf) + rsp]	; a1 += W[i-7]
+	vpaddd	%%T1, %%T1, a1		; T1 = W[i]
+
+	ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t blocks);
+;; arg 1 : STATE : pointer to the args structure (transposed digest rows and per-lane data pointers)
+;; arg 2 : INP_SIZE  : size of input in blocks
+mk_global sha256_mb_x8_avx2, function, internal
+align 16
+sha256_mb_x8_avx2:
+	endbranch
+	; general registers preserved in outer calling routine
+	; outer calling routine saves all the XMM registers
+
+	; save rsp, allocate 32-byte aligned for local variables
+	mov	IDX, rsp		; IDX temporarily holds the caller's rsp
+	sub	rsp, FRAMESZ
+	and	rsp, ~31		; vmovdqa on the frame needs 32-byte alignment
+	mov	[rsp + _RSP_SAVE], IDX
+
+
+	;; Load the pre-transposed incoming digest.
+	vmovdqu	a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
+	vmovdqu	h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+	lea	TBL,[K256_8_MB]		; TBL = address of the round-constant table
+
+	;; load the address of each of the 8 message lanes
+	;; getting ready to transpose input onto stack
+	mov	inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+	mov	inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+	mov	inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+	mov	inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+	mov	inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+	mov	inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+	mov	inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+	mov	inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+	xor	IDX, IDX		; IDX = bytes consumed so far from every lane
+lloop:
+	xor	ROUND, ROUND		; ROUND = byte offset into the K table
+
+	;; save old digest
+	vmovdqa	[rsp + _DIGEST + 0*SZ8], a
+	vmovdqa	[rsp + _DIGEST + 1*SZ8], b
+	vmovdqa	[rsp + _DIGEST + 2*SZ8], c
+	vmovdqa	[rsp + _DIGEST + 3*SZ8], d
+	vmovdqa	[rsp + _DIGEST + 4*SZ8], e
+	vmovdqa	[rsp + _DIGEST + 5*SZ8], f
+	vmovdqa	[rsp + _DIGEST + 6*SZ8], g
+	vmovdqa	[rsp + _DIGEST + 7*SZ8], h
+%assign i 0
+%rep 2
+	VMOVPS	TT0,[inp0+IDX+i*32]	; rounds 0..15: each of the 2 passes loads 32 bytes (8 words) per lane
+	VMOVPS	TT1,[inp1+IDX+i*32]
+	VMOVPS	TT2,[inp2+IDX+i*32]
+	VMOVPS	TT3,[inp3+IDX+i*32]
+	VMOVPS	TT4,[inp4+IDX+i*32]
+	VMOVPS	TT5,[inp5+IDX+i*32]
+	VMOVPS	TT6,[inp6+IDX+i*32]
+	VMOVPS	TT7,[inp7+IDX+i*32]
+	vmovdqa	[YTMP0], g		; spill g: TRANSPOSE8 uses TMP0 (same register) as a temporary
+	vmovdqa	[YTMP1], h		; spill h: TMP1 shares its register
+	TRANSPOSE8	TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7,   TMP0, TMP1
+	vmovdqa	TMP1, [PSHUFFLE_BYTE_FLIP_MASK]	; mask reverses the bytes of every dword
+	vmovdqa	g, [YTMP0]		; restore g
+	vpshufb	TT0, TT0, TMP1
+	vpshufb	TT1, TT1, TMP1
+	vpshufb	TT2, TT2, TMP1
+	vpshufb	TT3, TT3, TMP1
+	vpshufb	TT4, TT4, TMP1
+	vpshufb	TT5, TT5, TMP1
+	vpshufb	TT6, TT6, TMP1
+	vpshufb	TT7, TT7, TMP1
+	vmovdqa	h, [YTMP1]		; restore h now that the mask is no longer needed
+	vmovdqa	[YTMP0], TT4		; park words 4..7: TT4..TT7 share registers with a0, a1, a2 and TMP used by the rounds
+	vmovdqa	[YTMP1], TT5
+	vmovdqa	[YTMP2], TT6
+	vmovdqa	[YTMP3], TT7
+	ROUND_00_15	TT0,(i*8+0)
+	vmovdqa	TT0, [YTMP0]		; TT0 is free again: reload parked word 4
+	ROUND_00_15	TT1,(i*8+1)
+	vmovdqa	TT1, [YTMP1]		; reload parked word 5
+	ROUND_00_15	TT2,(i*8+2)
+	vmovdqa	TT2, [YTMP2]		; reload parked word 6
+	ROUND_00_15	TT3,(i*8+3)
+	vmovdqa	TT3, [YTMP3]		; reload parked word 7
+	ROUND_00_15	TT0,(i*8+4)
+	ROUND_00_15	TT1,(i*8+5)
+	ROUND_00_15	TT2,(i*8+6)
+	ROUND_00_15	TT3,(i*8+7)
+%assign i (i+1)
+%endrep
+	add	IDX, 4*4*4		; consumed one 64-byte block per lane
+	;; i is 2 after the rep block; rescale it so the schedule rounds start at i = 16
+%assign i (i*8)
+
+	jmp	Lrounds_16_xx		; hop over any padding emitted by the align
+align 16
+Lrounds_16_xx:
+%rep 16
+	ROUND_16_XX	T1, i
+%assign i (i+1)
+%endrep
+
+	cmp	ROUND,ROUNDS		; 16 more rounds per pass until all 64 K rows are used
+	jb	Lrounds_16_xx
+
+	;; add old digest
+	vpaddd	a, a, [rsp + _DIGEST + 0*SZ8]
+	vpaddd	b, b, [rsp + _DIGEST + 1*SZ8]
+	vpaddd	c, c, [rsp + _DIGEST + 2*SZ8]
+	vpaddd	d, d, [rsp + _DIGEST + 3*SZ8]
+	vpaddd	e, e, [rsp + _DIGEST + 4*SZ8]
+	vpaddd	f, f, [rsp + _DIGEST + 5*SZ8]
+	vpaddd	g, g, [rsp + _DIGEST + 6*SZ8]
+	vpaddd	h, h, [rsp + _DIGEST + 7*SZ8]
+
+	sub	INP_SIZE, 1  ;; unit is blocks
+	jne	lloop
+
+	; write back to memory (state object) the transposed digest
+	vmovdqu	[STATE + 0*SHA256_DIGEST_ROW_SIZE],a
+	vmovdqu	[STATE + 1*SHA256_DIGEST_ROW_SIZE],b
+	vmovdqu	[STATE + 2*SHA256_DIGEST_ROW_SIZE],c
+	vmovdqu	[STATE + 3*SHA256_DIGEST_ROW_SIZE],d
+	vmovdqu	[STATE + 4*SHA256_DIGEST_ROW_SIZE],e
+	vmovdqu	[STATE + 5*SHA256_DIGEST_ROW_SIZE],f
+	vmovdqu	[STATE + 6*SHA256_DIGEST_ROW_SIZE],g
+	vmovdqu	[STATE + 7*SHA256_DIGEST_ROW_SIZE],h
+
+	; advance each lane's data pointer by the bytes consumed (IDX) and store it back
+	add	inp0, IDX
+	mov	[STATE + _args_data_ptr + 0*8], inp0
+	add	inp1, IDX
+	mov	[STATE + _args_data_ptr + 1*8], inp1
+	add	inp2, IDX
+	mov	[STATE + _args_data_ptr + 2*8], inp2
+	add	inp3, IDX
+	mov	[STATE + _args_data_ptr + 3*8], inp3
+	add	inp4, IDX
+	mov	[STATE + _args_data_ptr + 4*8], inp4
+	add	inp5, IDX
+	mov	[STATE + _args_data_ptr + 5*8], inp5
+	add	inp6, IDX
+	mov	[STATE + _args_data_ptr + 6*8], inp6
+	add	inp7, IDX
+	mov	[STATE + _args_data_ptr + 7*8], inp7
+
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+	mov	rsp, [rsp + _RSP_SAVE]	; restore the caller's rsp saved in the prologue
+	ret				; NOTE(review): no vzeroupper before returning; presumably handled by the calling routine - confirm
+
+section .data
+align 64
+K256_8_MB:
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
new file mode 100644
index 000000000..af54f7cc3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
@@ -0,0 +1,125 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; Declare the L3 ctx level symbols, one init/submit/flush triple per variant
+; (these will then call the appropriate L2 symbols)
+extern sha256_ctx_mgr_init_sse
+extern sha256_ctx_mgr_submit_sse
+extern sha256_ctx_mgr_flush_sse
+
+extern sha256_ctx_mgr_init_avx
+extern sha256_ctx_mgr_submit_avx
+extern sha256_ctx_mgr_flush_avx
+
+extern sha256_ctx_mgr_init_avx2
+extern sha256_ctx_mgr_submit_avx2
+extern sha256_ctx_mgr_flush_avx2
+
+extern sha256_ctx_mgr_init_base
+extern sha256_ctx_mgr_submit_base
+extern sha256_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512	; avx512 variants are referenced only when the build defines this macro
+ extern sha256_ctx_mgr_init_avx512
+ extern sha256_ctx_mgr_submit_avx512
+ extern sha256_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI	; likewise for the SHA-NI (sse_ni) variants
+ extern sha256_ctx_mgr_init_sse_ni
+ extern sha256_ctx_mgr_submit_sse_ni
+ extern sha256_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512	; the avx512_ni variants need both macros to be defined
+ %ifdef HAVE_AS_KNOWS_SHANI
+  extern sha256_ctx_mgr_init_avx512_ni
+  extern sha256_ctx_mgr_submit_avx512_ni
+  extern sha256_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit is the initial value of *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ ; Dispatcher arguments: public symbol, then the base, sse, avx, avx2, avx512 (and sse_ni, avx512_ni) variants
+ %ifdef HAVE_AS_KNOWS_SHANI
+  mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+	sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+	sha256_ctx_mgr_init_avx512, sha256_ctx_mgr_init_sse_ni, sha256_ctx_mgr_init_avx512_ni
+  mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+	sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+	sha256_ctx_mgr_submit_avx512, sha256_ctx_mgr_submit_sse_ni, sha256_ctx_mgr_submit_avx512_ni
+  mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+	sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+	sha256_ctx_mgr_flush_avx512, sha256_ctx_mgr_flush_sse_ni, sha256_ctx_mgr_flush_avx512_ni
+ %else	; AVX512 known to the assembler, SHA-NI not
+  mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+	sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+	sha256_ctx_mgr_init_avx512
+  mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+	sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+	sha256_ctx_mgr_submit_avx512
+  mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+	sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+	sha256_ctx_mgr_flush_avx512
+ %endif
+%else	; no AVX512 -- NOTE(review): these dispatchers get no *_base variant, their list starts at sse; confirm intended
+ %ifdef HAVE_AS_KNOWS_SHANI
+  mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+	sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_sse_ni
+  mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+	sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_sse_ni
+  mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+	sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_sse_ni
+ %else	; neither AVX512 nor SHA-NI: sse, avx and avx2 variants only
+  mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+	sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2
+  mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+	sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2
+  mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+	sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2
+ %endif
+%endif
+
+;;;       func  			core, ver, snum
+slversion sha256_ctx_mgr_init,  	00,   04,  0160
+slversion sha256_ctx_mgr_submit,	00,   04,  0161
+slversion sha256_ctx_mgr_flush, 	00,   04,  0162
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
new file mode 100644
index 000000000..25fc9ce16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
@@ -0,0 +1,361 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux (elf64): the two explicit arguments arrive in rdi, rsi
+ %define arg0  rdi
+ %define arg1  rsi
+%else
+ ; Windows -- NOTE(review): every non-elf64 output format lands here (e.g. macho64); confirm against the caller
+ %define arg0   rcx
+ %define arg1   rdx
+%endif
+
+%define MSG     	xmm0	; message words + round constants; xmm0 is the implicit operand of sha256rnds2
+%define STATE0  	xmm1	; working state words A,B,E,F (A in dword 3, F in dword 0)
+%define STATE1  	xmm2	; working state words C,D,G,H (C in dword 3, H in dword 0)
+%define MSGTMP0 	xmm3	; MSGTMP0-3: rotating window of message-schedule words
+%define MSGTMP1 	xmm4
+%define MSGTMP2 	xmm5
+%define MSGTMP3 	xmm6
+%define MSGTMP4 	xmm7	; scratch for the palignr step of the schedule
+
+%define SHUF_MASK       xmm8	; pshufb control, loaded from PSHUFFLE_SHANI_MASK
+
+%define ABEF_SAVE       xmm9	; copy of STATE0 taken at the top of each block
+%define CDGH_SAVE       xmm10	; copy of STATE1 taken at the top of each block
+
+; Argument names here are numbered from 0 (arg0, arg1) while mgr_flush/submit number theirs from 1
+%define MGR     arg0
+%define NBLK    arg1
+%define NLANX4  r10     ; consistent with caller; on entry r10 holds (NLANX4 << 8) | IDX
+%define IDX     r8      ; lane index, taken from the low 8 bits of r10 (r8 is overwritten)
+%define DPTR    r11     ; local variable -- input buffer pointer
+%define TMP     r9      ; local variable -- assistant to address digest
+%define TBL     rax     ; local variable -- address of TABLE
+;%define TMP2   r8      ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks); a count of 0 returns without touching any state
+; invisible arg 2 : IDX : hash on which lane (passed in the low 8 bits of r10)
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch, passed in r10 >> 8 and used as the
+; 		 byte stride between digest rows (lane count: sse/avx is 4, avx2 is 8, avx512 is 16)
+; Side effects: the lane's digest words and its _data_ptr slot are updated in place.
+; Clobbers registers: rax, r8~r11, NBLK (arg1), flags, xmm0-xmm10
+;
+mk_global sha256_ni_x1, function, internal
+sha256_ni_x1:
+	endbranch
+	shl     NBLK, 6 	; transform blk amount into bytes (64 per block); ZF is set when the result is 0
+	jz      backto_mgr	; nothing to hash: return with digest and data pointer untouched
+
+	; r10 arrives as (NLANX4 << 8) | IDX: split it into its two fields
+	mov     IDX, NLANX4
+	shr     NLANX4, 8
+	and     IDX, 0xff
+
+	lea     TMP, [MGR + 4*IDX]	; address of this lane's dword in digest row 0
+	;; Initialize digest: word k of the lane is read from MGR + 4*IDX + k*NLANX4. TMP is
+	;; rebased twice because an index can only be scaled by 1/2/4/8. digests -> ABEF(state0), CDGH(state1)
+	pinsrd  STATE0, [TMP + 0*NLANX4], 3     ; A
+	pinsrd  STATE0, [TMP + 1*NLANX4], 2     ; B
+	pinsrd  STATE1, [TMP + 2*NLANX4], 3     ; C
+	lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
+	pinsrd  STATE1, [TMP + 1*NLANX4], 2     ; D
+	pinsrd  STATE0, [TMP + 2*NLANX4], 1     ; E
+	pinsrd  STATE1, [TMP + 4*NLANX4], 1     ; G
+	lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 3*NLANX4
+	pinsrd  STATE0, [TMP + 2*NLANX4], 0     ; F
+	pinsrd  STATE1, [TMP + 4*NLANX4], 0     ; H
+
+	movdqa  SHUF_MASK, [PSHUFFLE_SHANI_MASK]	; byte-order shuffle control for the input words
+	lea     TBL, [TABLE]	; base of the round-constant table
+
+	;; Load input pointers
+	mov     DPTR, [MGR + _data_ptr + IDX*8]
+	;; nblk is used to indicate data end: start pointer + byte count = end address
+	add     NBLK, DPTR
+
+lloop:
+	; /* Save hash values for addition after rounds */
+	movdqa  	ABEF_SAVE, STATE0
+	movdqa  	CDGH_SAVE, STATE1
+
+	; /* Rounds 0-3: rounds 0-15 take their message words straight from the input */
+	movdqu  	MSG, [DPTR + 0*16]	; unaligned load: no alignment is required of the input
+	pshufb  	MSG, SHUF_MASK	; reverse the bytes of each dword
+	movdqa  	MSGTMP0, MSG	; keep the four words for the message schedule
+		paddd   	MSG, [TBL + 0*16]	; add the four round constants
+		sha256rnds2     STATE1, STATE0, MSG	; two rounds, consuming the low two dwords of MSG
+		pshufd  	MSG, MSG, 0x0E	; bring the upper two dwords down
+		sha256rnds2     STATE0, STATE1, MSG	; two more rounds with STATE0/STATE1 swapped
+
+	; /* Rounds 4-7 */
+	movdqu  	MSG, [DPTR + 1*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP1, MSG
+		paddd   	MSG, [TBL + 1*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP0, MSGTMP1
+
+	; /* Rounds 8-11 */
+	movdqu  	MSG, [DPTR + 2*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP2, MSG
+		paddd   	MSG, [TBL + 2*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP1, MSGTMP2
+
+	; /* Rounds 12-15: last input words; the schedule for rounds 16-19 is finished here */
+	movdqu  	MSG, [DPTR + 3*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP3, MSG
+		paddd   	MSG, [TBL + 3*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP3
+	palignr 	MSGTMP4, MSGTMP2, 4	; MSGTMP4 = (MSGTMP3:MSGTMP2) shifted right by one dword
+	paddd   	MSGTMP0, MSGTMP4
+	sha256msg2      MSGTMP0, MSGTMP3	; MSGTMP0 now holds the next four schedule words
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP2, MSGTMP3
+
+	; /* Rounds 16-19: from here on the message words come from the schedule registers */
+	movdqa  	MSG, MSGTMP0
+		paddd   	MSG, [TBL + 4*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP0
+	palignr 	MSGTMP4, MSGTMP3, 4
+	paddd   	MSGTMP1, MSGTMP4
+	sha256msg2      MSGTMP1, MSGTMP0
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP3, MSGTMP0
+
+	; /* Rounds 20-23 */
+	movdqa  	MSG, MSGTMP1
+		paddd   	MSG, [TBL + 5*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP1
+	palignr 	MSGTMP4, MSGTMP0, 4
+	paddd   	MSGTMP2, MSGTMP4
+	sha256msg2      MSGTMP2, MSGTMP1
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP0, MSGTMP1
+
+	; /* Rounds 24-27 */
+	movdqa  	MSG, MSGTMP2
+		paddd   	MSG, [TBL + 6*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP2
+	palignr 	MSGTMP4, MSGTMP1, 4
+	paddd   	MSGTMP3, MSGTMP4
+	sha256msg2      MSGTMP3, MSGTMP2
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP1, MSGTMP2
+
+	; /* Rounds 28-31 */
+	movdqa  	MSG, MSGTMP3
+		paddd   	MSG, [TBL + 7*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP3
+	palignr 	MSGTMP4, MSGTMP2, 4
+	paddd   	MSGTMP0, MSGTMP4
+	sha256msg2      MSGTMP0, MSGTMP3
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP2, MSGTMP3
+
+	; /* Rounds 32-35 */
+	movdqa  	MSG, MSGTMP0
+		paddd   	MSG, [TBL + 8*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP0
+	palignr 	MSGTMP4, MSGTMP3, 4
+	paddd   	MSGTMP1, MSGTMP4
+	sha256msg2      MSGTMP1, MSGTMP0
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP3, MSGTMP0
+
+	; /* Rounds 36-39 */
+	movdqa  	MSG, MSGTMP1
+		paddd   	MSG, [TBL + 9*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP1
+	palignr 	MSGTMP4, MSGTMP0, 4
+	paddd   	MSGTMP2, MSGTMP4
+	sha256msg2      MSGTMP2, MSGTMP1
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP0, MSGTMP1
+
+	; /* Rounds 40-43 */
+	movdqa  	MSG, MSGTMP2
+		paddd   	MSG, [TBL + 10*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP2
+	palignr 	MSGTMP4, MSGTMP1, 4
+	paddd   	MSGTMP3, MSGTMP4
+	sha256msg2      MSGTMP3, MSGTMP2
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP1, MSGTMP2
+
+	; /* Rounds 44-47 */
+	movdqa  	MSG, MSGTMP3
+		paddd   	MSG, [TBL + 11*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP3
+	palignr 	MSGTMP4, MSGTMP2, 4
+	paddd   	MSGTMP0, MSGTMP4
+	sha256msg2      MSGTMP0, MSGTMP3
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP2, MSGTMP3
+
+	; /* Rounds 48-51 */
+	movdqa  	MSG, MSGTMP0
+		paddd   	MSG, [TBL + 12*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP0
+	palignr 	MSGTMP4, MSGTMP3, 4
+	paddd   	MSGTMP1, MSGTMP4
+	sha256msg2      MSGTMP1, MSGTMP0
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP3, MSGTMP0
+
+	; /* Rounds 52-55: no sha256msg1 is issued from this group onwards */
+	movdqa  	MSG, MSGTMP1
+		paddd   	MSG, [TBL + 13*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP1
+	palignr 	MSGTMP4, MSGTMP0, 4
+	paddd   	MSGTMP2, MSGTMP4
+	sha256msg2      MSGTMP2, MSGTMP1
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	; /* Rounds 56-59 */
+	movdqa  	MSG, MSGTMP2
+		paddd   	MSG, [TBL + 14*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP2
+	palignr 	MSGTMP4, MSGTMP1, 4
+	paddd   	MSGTMP3, MSGTMP4
+	sha256msg2      MSGTMP3, MSGTMP2
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	; /* Rounds 60-63: last group, no further schedule words are produced */
+	movdqa  	MSG, MSGTMP3
+		paddd   	MSG, [TBL + 15*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	; /* Add current hash values with previously saved */
+	paddd   	STATE0, ABEF_SAVE
+	paddd   	STATE1, CDGH_SAVE
+
+	; Increment data pointer and loop if more to process (NBLK holds the end address)
+	add     	DPTR, 64
+	cmp     	DPTR, NBLK
+	jne     	lloop
+
+	; write out digests, using the same row addressing as the load in the prologue
+	lea     TMP, [MGR + 4*IDX]
+	;; ABEF(state0), CDGH(state1) -> digests
+	pextrd  [TMP + 0*NLANX4], STATE0, 3     ; A
+	pextrd  [TMP + 1*NLANX4], STATE0, 2     ; B
+	pextrd  [TMP + 2*NLANX4], STATE1, 3     ; C
+	lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
+	pextrd  [TMP + 1*NLANX4], STATE1, 2     ; D
+	pextrd  [TMP + 2*NLANX4], STATE0, 1     ; E
+	pextrd  [TMP + 4*NLANX4], STATE1, 1     ; G
+	lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 3*NLANX4
+	pextrd  [TMP + 2*NLANX4], STATE0, 0     ; F
+	pextrd  [TMP + 4*NLANX4], STATE1, 0     ; H
+
+	; update input pointers: store the advanced pointer back into this lane's slot
+	mov     [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+
+	ret
+
+
+section .data align=16	; NOTE(review): the code in this file only reads these objects, yet they sit in writable .data
+PSHUFFLE_SHANI_MASK:    dq 0x0405060700010203, 0x0c0d0e0f08090a0b	; pshufb control: reverses the bytes of each dword
+TABLE:	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5	; 64 round constants, 4 per row; row i is added to the message words of rounds 4i..4i+3
+	dd      0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	dd      0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	dd      0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	dd      0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	dd      0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	dd      0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	dd      0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	dd      0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	dd      0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	dd      0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	dd      0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	dd      0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	dd      0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	dd      0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	dd      0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x1	; win64 build without HAVE_AS_KNOWS_SHANI: export a placeholder symbol instead of the routine
+no_sha256_ni_x1:	; label only, no code follows (presumably keeps the object from being empty -- TODO confirm)
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
new file mode 100644
index 000000000..74cfc93b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
@@ -0,0 +1,574 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux (elf64): the two explicit arguments arrive in rdi, rsi
+ %define arg0  rdi
+ %define arg1  rsi
+%else
+ ; Windows -- NOTE(review): every non-elf64 output format lands here (e.g. macho64); confirm against the caller
+ %define arg0   rcx
+ %define arg1   rdx
+%endif
+
+;; Bytes reserved on the stack; the prologue has no pushes and rounds rsp down to a 16-byte boundary itself
+%define FRAMESZ	64		; 4 x 16 bytes: saved STATE0/STATE1 of both lanes
+%define RSPSAVE	rax		; receives the caller's rsp before the frame is carved out
+
+%define MSG     	xmm0	; message words + round constants, shared by both lanes; implicit operand of sha256rnds2
+%define STATE0  	xmm1	; first lane: working state words A,B,E,F
+%define STATE1  	xmm2	; first lane: working state words C,D,G,H
+%define MSGTMP0 	xmm3	; first lane: MSGTMP0-3 hold the message-schedule words
+%define MSGTMP1 	xmm4
+%define MSGTMP2 	xmm5
+%define MSGTMP3 	xmm6
+%define MSGTMP4 	xmm7	; first lane: scratch for the palignr step of the schedule
+
+%define STATE0b		xmm8	; second lane: same roles as the registers above
+%define STATE1b		xmm9
+%define MSGTMP0b	xmm10
+%define MSGTMP1b	xmm11
+%define MSGTMP2b	xmm12
+%define MSGTMP3b	xmm13
+%define MSGTMP4b	xmm14
+
+%define SHUF_MASK       xmm15	; pshufb control, loaded from PSHUFFLE_SHANI_MASK
+
+; Argument names here are numbered from 0 (arg0, arg1) while mgr_flush/submit number theirs from 1
+%define MGR     arg0
+%define NBLK    arg1
+%define NLANX4  r10     ; consistent with caller; on entry r10 holds (NLANX4 << 8) | IDX
+%define IDX     r8      ; low 8 bits of r10 (r8 is overwritten)
+%define DPTR    r11     ; local variable -- input buffer pointer
+%define DPTRb   r12     ; second lane's input pointer; NOTE(review): r12-r14 are callee-saved in both ABIs and the prologue saves none
+%define TMP     r9      ; local variable -- assistant to address digest
+%define TBL     r13     ; local variable -- address of TABLE
+%define TMPb    r14      ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisibile arg 2 : IDX : hash on which lane
+; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; 		 (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r14, xmm0-xmm15
+;
+mk_global sha256_ni_x2, function, internal
+sha256_ni_x2:
+	endbranch
+	mov 	RSPSAVE, rsp
+	sub 	rsp, FRAMESZ
+	and 	rsp, ~0xF	; Align 16Bytes downward
+
+	shl     NBLK, 6		; transform blk amount into bytes
+	jz      backto_mgr
+
+	; detach idx from nlanx4
+	mov     IDX, NLANX4
+	shr     NLANX4, 8
+	and     IDX, 0xff
+
+	lea     TMP, [MGR + 4*0]
+	lea     TMPb, [MGR + 4*1]
+
+	;; Initialize digest
+	;; digests -> ABEF(state0), CDGH(state1)
+	pinsrd  STATE0, [TMP + 0*NLANX4], 3     ; A
+	pinsrd  STATE0, [TMP + 1*NLANX4], 2     ; B
+	pinsrd  STATE1, [TMP + 2*NLANX4], 3     ; C
+	lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
+	pinsrd  STATE1, [TMP + 1*NLANX4], 2     ; D
+	pinsrd  STATE0, [TMP + 2*NLANX4], 1     ; E
+	pinsrd  STATE1, [TMP + 4*NLANX4], 1     ; G
+	lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 6*NLANX4
+	pinsrd  STATE0, [TMP + 2*NLANX4], 0     ; F
+	pinsrd  STATE1, [TMP + 4*NLANX4], 0     ; H
+
+	pinsrd  STATE0b, [TMPb + 0*NLANX4], 3     ; A
+	pinsrd  STATE0b, [TMPb + 1*NLANX4], 2     ; B
+	pinsrd  STATE1b, [TMPb + 2*NLANX4], 3     ; C
+	lea     TMPb, [TMPb + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
+	pinsrd  STATE1b, [TMPb + 1*NLANX4], 2     ; D
+	pinsrd  STATE0b, [TMPb + 2*NLANX4], 1     ; E
+	pinsrd  STATE1b, [TMPb + 4*NLANX4], 1     ; G
+	lea     TMPb, [TMPb + 1*NLANX4]   ; MGR + 4*IDX + 6*NLANX4
+	pinsrd  STATE0b, [TMPb + 2*NLANX4], 0     ; F
+	pinsrd  STATE1b, [TMPb + 4*NLANX4], 0     ; H
+
+	movdqa  SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+	lea     TBL, [TABLE]
+
+	;; Load input pointers
+	mov     DPTR, [MGR + _data_ptr + 8*0]
+	mov     DPTRb,[MGR + _data_ptr + 8*1]
+	;; nblk is used to indicate data end
+	add     NBLK, DPTR
+
+lloop:
+	; /* Save hash values for addition after rounds */
+	movdqa		[rsp + 0*16], STATE0
+	movdqa		[rsp + 1*16], STATE1
+
+	movdqa		[rsp + 2*16], STATE0b
+	movdqa		[rsp + 3*16], STATE1b
+
+	; /* Rounds 0-3 */
+	movdqu  	MSG, [DPTR + 0*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP0, MSG
+		paddd   	MSG, [TBL + 0*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	movdqu  	MSG, [DPTRb + 0*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP0b, MSG
+		paddd   	MSG, [TBL + 0*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+
+	; /* Rounds 4-7 */
+	movdqu  	MSG, [DPTR + 1*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP1, MSG
+		paddd   	MSG, [TBL + 1*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP0, MSGTMP1
+
+	movdqu  	MSG, [DPTRb + 1*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP1b, MSG
+		paddd   	MSG, [TBL + 1*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP0b, MSGTMP1b
+
+	; /* Rounds 8-11 */
+	movdqu  	MSG, [DPTR + 2*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP2, MSG
+		paddd   	MSG, [TBL + 2*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP1, MSGTMP2
+
+	movdqu  	MSG, [DPTRb + 2*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP2b, MSG
+		paddd   	MSG, [TBL + 2*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP1b, MSGTMP2b
+
+	; /* Rounds 12-15 */
+	movdqu  	MSG, [DPTR + 3*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP3, MSG
+		paddd   	MSG, [TBL + 3*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP3
+	palignr 	MSGTMP4, MSGTMP2, 4
+	paddd   	MSGTMP0, MSGTMP4
+	sha256msg2      MSGTMP0, MSGTMP3
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP2, MSGTMP3
+
+	movdqu  	MSG, [DPTRb + 3*16]
+	pshufb  	MSG, SHUF_MASK
+	movdqa  	MSGTMP3b, MSG
+		paddd   	MSG, [TBL + 3*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP3b
+	palignr 	MSGTMP4b, MSGTMP2b, 4
+	paddd   	MSGTMP0b, MSGTMP4b
+	sha256msg2      MSGTMP0b, MSGTMP3b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP2b, MSGTMP3b
+
+	; /* Rounds 16-19 */
+	movdqa  	MSG, MSGTMP0
+		paddd   	MSG, [TBL + 4*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP0
+	palignr 	MSGTMP4, MSGTMP3, 4
+	paddd   	MSGTMP1, MSGTMP4
+	sha256msg2      MSGTMP1, MSGTMP0
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP3, MSGTMP0
+
+	movdqa  	MSG, MSGTMP0b
+		paddd   	MSG, [TBL + 4*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP0b
+	palignr 	MSGTMP4b, MSGTMP3b, 4
+	paddd   	MSGTMP1b, MSGTMP4b
+	sha256msg2      MSGTMP1b, MSGTMP0b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP3b, MSGTMP0b
+
+	; /* Rounds 20-23 */
+	movdqa  	MSG, MSGTMP1
+		paddd   	MSG, [TBL + 5*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP1
+	palignr 	MSGTMP4, MSGTMP0, 4
+	paddd   	MSGTMP2, MSGTMP4
+	sha256msg2      MSGTMP2, MSGTMP1
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP0, MSGTMP1
+
+	movdqa  	MSG, MSGTMP1b
+		paddd   	MSG, [TBL + 5*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP1b
+	palignr 	MSGTMP4b, MSGTMP0b, 4
+	paddd   	MSGTMP2b, MSGTMP4b
+	sha256msg2      MSGTMP2b, MSGTMP1b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP0b, MSGTMP1b
+
+	; /* Rounds 24-27 */
+	movdqa  	MSG, MSGTMP2
+		paddd   	MSG, [TBL + 6*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP2
+	palignr 	MSGTMP4, MSGTMP1, 4
+	paddd   	MSGTMP3, MSGTMP4
+	sha256msg2      MSGTMP3, MSGTMP2
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP1, MSGTMP2
+
+	movdqa  	MSG, MSGTMP2b
+		paddd   	MSG, [TBL + 6*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP2b
+	palignr 	MSGTMP4b, MSGTMP1b, 4
+	paddd   	MSGTMP3b, MSGTMP4b
+	sha256msg2      MSGTMP3b, MSGTMP2b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP1b, MSGTMP2b
+
+	; /* Rounds 28-31 */
+	movdqa  	MSG, MSGTMP3
+		paddd   	MSG, [TBL + 7*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP3
+	palignr 	MSGTMP4, MSGTMP2, 4
+	paddd   	MSGTMP0, MSGTMP4
+	sha256msg2      MSGTMP0, MSGTMP3
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP2, MSGTMP3
+
+	movdqa  	MSG, MSGTMP3b
+		paddd   	MSG, [TBL + 7*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP3b
+	palignr 	MSGTMP4b, MSGTMP2b, 4
+	paddd   	MSGTMP0b, MSGTMP4b
+	sha256msg2      MSGTMP0b, MSGTMP3b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP2b, MSGTMP3b
+
+	; /* Rounds 32-35 */
+	movdqa  	MSG, MSGTMP0
+		paddd   	MSG, [TBL + 8*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP0
+	palignr 	MSGTMP4, MSGTMP3, 4
+	paddd   	MSGTMP1, MSGTMP4
+	sha256msg2      MSGTMP1, MSGTMP0
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP3, MSGTMP0
+
+	movdqa  	MSG, MSGTMP0b
+		paddd   	MSG, [TBL + 8*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP0b
+	palignr 	MSGTMP4b, MSGTMP3b, 4
+	paddd   	MSGTMP1b, MSGTMP4b
+	sha256msg2      MSGTMP1b, MSGTMP0b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP3b, MSGTMP0b
+
+	; /* Rounds 36-39 */
+	movdqa  	MSG, MSGTMP1
+		paddd   	MSG, [TBL + 9*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP1
+	palignr 	MSGTMP4, MSGTMP0, 4
+	paddd   	MSGTMP2, MSGTMP4
+	sha256msg2      MSGTMP2, MSGTMP1
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP0, MSGTMP1
+
+	movdqa  	MSG, MSGTMP1b
+		paddd   	MSG, [TBL + 9*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP1b
+	palignr 	MSGTMP4b, MSGTMP0b, 4
+	paddd   	MSGTMP2b, MSGTMP4b
+	sha256msg2      MSGTMP2b, MSGTMP1b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP0b, MSGTMP1b
+
+	; /* Rounds 40-43 */
+	movdqa  	MSG, MSGTMP2
+		paddd   	MSG, [TBL + 10*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP2
+	palignr 	MSGTMP4, MSGTMP1, 4
+	paddd   	MSGTMP3, MSGTMP4
+	sha256msg2      MSGTMP3, MSGTMP2
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP1, MSGTMP2
+
+	movdqa  	MSG, MSGTMP2b
+		paddd   	MSG, [TBL + 10*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP2b
+	palignr 	MSGTMP4b, MSGTMP1b, 4
+	paddd   	MSGTMP3b, MSGTMP4b
+	sha256msg2      MSGTMP3b, MSGTMP2b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP1b, MSGTMP2b
+
+	; /* Rounds 44-47 */
+	movdqa  	MSG, MSGTMP3
+		paddd   	MSG, [TBL + 11*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP3
+	palignr 	MSGTMP4, MSGTMP2, 4
+	paddd   	MSGTMP0, MSGTMP4
+	sha256msg2      MSGTMP0, MSGTMP3
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP2, MSGTMP3
+
+	movdqa  	MSG, MSGTMP3b
+		paddd   	MSG, [TBL + 11*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP3b
+	palignr 	MSGTMP4b, MSGTMP2b, 4
+	paddd   	MSGTMP0b, MSGTMP4b
+	sha256msg2      MSGTMP0b, MSGTMP3b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP2b, MSGTMP3b
+
+	; /* Rounds 48-51 */
+	movdqa  	MSG, MSGTMP0
+		paddd   	MSG, [TBL + 12*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP0
+	palignr 	MSGTMP4, MSGTMP3, 4
+	paddd   	MSGTMP1, MSGTMP4
+	sha256msg2      MSGTMP1, MSGTMP0
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+	sha256msg1      MSGTMP3, MSGTMP0
+
+	movdqa  	MSG, MSGTMP0b
+		paddd   	MSG, [TBL + 12*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP0b
+	palignr 	MSGTMP4b, MSGTMP3b, 4
+	paddd   	MSGTMP1b, MSGTMP4b
+	sha256msg2      MSGTMP1b, MSGTMP0b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+	sha256msg1      MSGTMP3b, MSGTMP0b
+
+	; /* Rounds 52-55 */
+	movdqa  	MSG, MSGTMP1
+		paddd   	MSG, [TBL + 13*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP1
+	palignr 	MSGTMP4, MSGTMP0, 4
+	paddd   	MSGTMP2, MSGTMP4
+	sha256msg2      MSGTMP2, MSGTMP1
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	movdqa  	MSG, MSGTMP1b
+		paddd   	MSG, [TBL + 13*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP1b
+	palignr 	MSGTMP4b, MSGTMP0b, 4
+	paddd   	MSGTMP2b, MSGTMP4b
+	sha256msg2      MSGTMP2b, MSGTMP1b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+
+	; /* Rounds 56-59 */
+	movdqa  	MSG, MSGTMP2
+		paddd   	MSG, [TBL + 14*16]
+		sha256rnds2     STATE1, STATE0, MSG
+	movdqa  	MSGTMP4, MSGTMP2
+	palignr 	MSGTMP4, MSGTMP1, 4
+	paddd   	MSGTMP3, MSGTMP4
+	sha256msg2      MSGTMP3, MSGTMP2
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	movdqa  	MSG, MSGTMP2b
+		paddd   	MSG, [TBL + 14*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+	movdqa  	MSGTMP4b, MSGTMP2b
+	palignr 	MSGTMP4b, MSGTMP1b, 4
+	paddd   	MSGTMP3b, MSGTMP4b
+	sha256msg2      MSGTMP3b, MSGTMP2b
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+
+	; /* Rounds 60-63 */
+	movdqa  	MSG, MSGTMP3
+		paddd   	MSG, [TBL + 15*16]
+		sha256rnds2     STATE1, STATE0, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0, STATE1, MSG
+
+	movdqa  	MSG, MSGTMP3b
+		paddd   	MSG, [TBL + 15*16]
+		sha256rnds2     STATE1b, STATE0b, MSG
+		pshufd  	MSG, MSG, 0x0E
+		sha256rnds2     STATE0b, STATE1b, MSG
+
+	; /* Add current hash values with previously saved */
+	paddd   	STATE0, [rsp + 0*16]
+	paddd   	STATE1, [rsp + 1*16]
+
+	paddd   	STATE0b, [rsp + 2*16]
+	paddd   	STATE1b, [rsp + 3*16]
+
+	; Increment data pointer and loop if more to process
+	add     	DPTR, 64
+	add     	DPTRb, 64
+	cmp     	DPTR, NBLK
+	jne     	lloop
+
+	; write out digests
+	lea     TMP, [MGR + 4*0]
+	;; ABEF(state0), CDGH(state1) -> digests
+	pextrd  [TMP + 0*NLANX4], STATE0, 3     ; A
+	pextrd  [TMP + 1*NLANX4], STATE0, 2     ; B
+	pextrd  [TMP + 2*NLANX4], STATE1, 3     ; C
+	lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
+	pextrd  [TMP + 1*NLANX4], STATE1, 2     ; D
+	pextrd  [TMP + 2*NLANX4], STATE0, 1     ; E
+	pextrd  [TMP + 4*NLANX4], STATE1, 1     ; G
+	lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 6*NLANX4
+	pextrd  [TMP + 2*NLANX4], STATE0, 0     ; F
+	pextrd  [TMP + 4*NLANX4], STATE1, 0     ; H
+
+	lea     TMPb, [MGR + 4*1]
+	;; ABEF(state0), CDGH(state1) -> digests
+	pextrd  [TMPb + 0*NLANX4], STATE0b, 3     ; A
+	pextrd  [TMPb + 1*NLANX4], STATE0b, 2     ; B
+	pextrd  [TMPb + 2*NLANX4], STATE1b, 3     ; C
+	lea     TMPb, [TMPb + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
+	pextrd  [TMPb + 1*NLANX4], STATE1b, 2     ; D
+	pextrd  [TMPb + 2*NLANX4], STATE0b, 1     ; E
+	pextrd  [TMPb + 4*NLANX4], STATE1b, 1     ; G
+	lea     TMPb, [TMPb + 1*NLANX4]   ; MGR + 4*IDX + 6*NLANX4
+	pextrd  [TMPb + 2*NLANX4], STATE0b, 0     ; F
+	pextrd  [TMPb + 4*NLANX4], STATE1b, 0     ; H
+
+	; update input pointers
+	mov     [MGR + _data_ptr + 0*8], DPTR
+	mov     [MGR + _data_ptr + 1*8], DPTRb
+
+backto_mgr:
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+	mov     rsp, RSPSAVE
+
+	ret
+
+section .data align=16	; constant data for the SHA-NI routine above
+PSHUFFLE_SHANI_MASK:    dq 0x0405060700010203, 0x0c0d0e0f08090a0b	; pshufb control: reverses the bytes inside each 32-bit dword
+TABLE:	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5	; SHA-256 round constants K[0..3]; row n = K[4n..4n+3] (presumably what TBL addresses -- setup not visible here)
+	dd      0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5	; K[4..7]
+	dd      0xd807aa98,0x12835b01,0x243185be,0x550c7dc3	; K[8..11]
+	dd      0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174	; K[12..15]
+	dd      0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc	; K[16..19]
+	dd      0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da	; K[20..23]
+	dd      0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7	; K[24..27]
+	dd      0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967	; K[28..31]
+	dd      0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13	; K[32..35]
+	dd      0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85	; K[36..39]
+	dd      0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3	; K[40..43]
+	dd      0xd192e819,0xd6990624,0xf40e3585,0x106aa070	; K[44..47]
+	dd      0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5	; K[48..51]
+	dd      0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3	; K[52..55]
+	dd      0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208	; K[56..59]
+	dd      0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	; K[60..63]
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x2
+no_sha256_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
new file mode 100644
index 000000000..fc13ec279
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
@@ -0,0 +1,567 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+;
+; Copyright (C) 2013 Intel Corporation.
+;
+; Authors:
+;     James Guilford <james.guilford@intel.com>
+;     Kirk Yap <kirk.s.yap@intel.com>
+;     Tim Chen <tim.c.chen@linux.intel.com>
+; Transcoded by:
+;     Xiaodong Liu <xiaodong.liu@intel.com>
+;
+; This software is available to you under the OpenIB.org BSD license
+; below:
+;
+;     Redistribution and use in source and binary forms, with or
+;     without modification, are permitted provided that the following
+;     conditions are met:
+;
+;      - Redistributions of source code must retain the above
+;        copyright notice, this list of conditions and the following
+;        disclaimer.
+;
+;      - Redistributions in binary form must reproduce the above
+;        copyright notice, this list of conditions and the following
+;        disclaimer in the documentation and/or other materials
+;        provided with the distribution.
+;
+; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+; SOFTWARE.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux (System V AMD64): the two arguments arrive in rdi and rsi
+ %define arg0  rdi
+ %define arg1  rsi
+%else
+ ; Any other output format is treated as Windows (Microsoft x64): rcx and rdx
+ %define arg0   rcx
+ %define arg1   rdx
+%endif
+
+%xdefine X0 xmm4	; X0..X3: the 16 message-schedule dwords in flight; names are rotated by rotate_Xs
+%xdefine X1 xmm5
+%xdefine X2 xmm6
+%xdefine X3 xmm7
+
+%xdefine XTMP0 xmm0	; XTMP0..XTMP4: scratch registers for the message-schedule computation
+%xdefine XTMP1 xmm1
+%xdefine XTMP2 xmm2
+%xdefine XTMP3 xmm3
+%xdefine XTMP4 xmm8
+%xdefine XFER xmm9	; K + W for four rounds, spilled to [rsp + _XFER] for the scalar round code
+
+%define SHUF_00BA xmm10      ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm11      ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm12 ; pshufb control that byte-swaps each dword of the input block
+
+; Argument numbering here starts at 0, whereas mgr_flush/submit number theirs from 1
+%define MGR	arg0	; rdi or rcx
+%define NBLK	arg1	; rsi or rdx
+%define IDX	r8	; lane index -- extracted from the low byte of r10, register choice matches the caller
+%define NLANX4	r10	; consistent with caller, must be r10; holds (NLANX4 << 8) | IDX on entry
+
+%define TMGR r9	; manager pointer reloaded from stack slot _TMGR (all four names below alias r9)
+%define INP r9	; input data pointer; parked in stack slot _INP whenever r9 is reused
+%define SRND r9	; round-group loop counter; clobbers INP
+%define TMP r9	; scratch base address used to reach the digest words
+
+%xdefine TBL rbp	; walks the K256 round-constant table
+%xdefine c ecx	; a..h: SHA-256 working variables; ROTATE_ARGS renames them after every round
+%xdefine d esi	; note: a..h overlap arg0/arg1 on both ABIs, hence the _TMGR/_INP_END stack slots
+%xdefine e edx
+%xdefine a eax
+%xdefine b ebx
+
+%xdefine f edi
+%xdefine g r12d
+%xdefine h r11d
+
+%xdefine y0 r13d	; y0..y2: per-round scalar temporaries
+%xdefine y1 r14d
+%xdefine y2 r15d
+
+
+;; FRAMESZ plus pushes must be an odd multiple of 8 (no pushes here: STACK_SIZE = 120 = 15*8)
+%define _STACK_ALIGN_SIZE 8	; 0 or 8 depends on pushes
+%define _INP_END_SIZE 8
+%define _INP_SIZE 8
+%define _TMGR_SIZE 8
+%define _XFER_SIZE 16
+%define _XMM_SAVE_SIZE 0
+%define _GPR_SAVE_SIZE 8*9	;rbx, rdx, rbp, (rdi, rsi), r12~r15
+
+%define _STACK_ALIGN 0	; frame offsets from rsp, lowest address first
+%define _INP_END (_STACK_ALIGN  + _STACK_ALIGN_SIZE)
+%define _INP (_INP_END  + _INP_END_SIZE)
+%define _TMGR (_INP + _INP_SIZE)
+%define _XFER (_TMGR + _TMGR_SIZE)
+%define _XMM_SAVE (_XFER + _XFER_SIZE)
+%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE)
+%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE)
+
+;; assume buffers not aligned, so input blocks are loaded with the unaligned move
+%define    MOVDQ movdqu
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm reg, [mem]
+; Add reg into mem, then reload reg from mem: both end up holding mem + reg (flags are clobbered)
+%macro addm 2
+        add     %2, %1 ; mem += reg
+        mov     %1, %2 ; reg = updated mem
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load 16 bytes from mem into xmm with MOVDQ (no alignment required), then shuffle them through byte_flip_mask
+%macro COPY_XMM_AND_BSWAP 3
+        MOVDQ %1, %2 ; xmm = 16 bytes at mem
+        pshufb %1, %3 ; reorder the bytes of xmm as selected by the mask register
+%endmacro
+
+; rotate_Xs
+; Assembly-time rename: X0<-X1, X1<-X2, X2<-X3, X3<-old X0. Emits no instructions; X_ is the temporary.
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endmacro
+
+; ROTATE_ARGS
+; Assembly-time rename of a...h by one position (h<-g, ..., b<-a, a<-old h). Emits no instructions.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endmacro
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+	;; Four SHA-256 rounds on the K+W dwords at [rsp + _XFER], interleaved with computing the next four W into X0
+	;; s0 is computed four dwords at a time, s1 two at a time, W[-16] + W[-7] four at a time; scalar "ror" = 32-bit rotate right
+	movdqa  XTMP0, X3               ; XTMP0 = W[-4..-1]
+	mov     y0, e 			; y0 = e
+	ror     y0, (25-11)             ; y0 = e ror (25-11)
+	mov     y1, a                   ; y1 = a
+	palignr XTMP0, X2, 4            ; XTMP0 = W[-7]
+	ror     y1, (22-13)             ; y1 = a ror (22-13)
+	xor     y0, e                   ; y0 = e ^ (e ror (25-11))
+	mov     y2, f                   ; y2 = f
+	ror     y0, (11-6)              ; y0 = (e ror (11-6)) ^ (e ror (25-6))
+	movdqa  XTMP1, X1               ; XTMP1 = W[-12..-9]
+	xor     y1, a                   ; y1 = a ^ (a ror (22-13))
+	xor     y2, g                   ; y2 = f^g
+	paddd   XTMP0, X0               ; XTMP0 = W[-7] + W[-16]
+	xor     y0, e                   ; y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
+	and     y2, e                   ; y2 = (f^g)&e
+	ror     y1, (13-2)              ; y1 = (a ror (13-2)) ^ (a ror (22-2))
+	;; compute s0
+	palignr XTMP1, X0, 4            ; XTMP1 = W[-15]
+	xor     y1, a                   ; y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
+	ror     y0, 6                   ; y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
+	movdqa  XTMP2, XTMP1            ; XTMP2 = W[-15]
+	ror     y1, 2                   ; y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+	add     y2, y0                  ; y2 = S1 + CH
+	add     y2 , [rsp + _XFER]      ; y2 = k + w + S1 + CH
+	movdqa  XTMP3, XTMP1            ; XTMP3 = W[-15]
+	mov     y0, a                   ; y0 = a
+	add     h, y2                   ; h = h + S1 + CH + k + w
+	mov     y2, a                   ; y2 = a
+	pslld   XTMP1, (32-7)           ; XTMP1 = W[-15] << (32-7)
+	or      y0, c                   ; y0 = a|c
+	add     d, h                    ; d = d + h + S1 + CH + k + w
+	and     y2, c                   ; y2 = a&c
+	psrld   XTMP2, 7                ; XTMP2 = W[-15] >> 7
+	and     y0, b                   ; y0 = (a|c)&b
+	add     h, y1                   ; h = h + S1 + CH + k + w + S0
+	por     XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7
+	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
+	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP2, XTMP3            ; XTMP2 = W[-15]
+	mov     y0, e                   ; y0 = e
+	mov     y1, a                   ; y1 = a
+	movdqa  XTMP4, XTMP3            ; XTMP4 = W[-15]
+	ror     y0, (25-11)             ; y0 = e ror (25-11)
+	xor     y0, e                   ; y0 = e ^ (e ror (25-11))
+	mov     y2, f                   ; y2 = f
+	ror     y1, (22-13)             ; y1 = a ror (22-13)
+	pslld   XTMP3, (32-18)          ; XTMP3 = W[-15] << (32-18)
+	xor     y1, a                   ; y1 = a ^ (a ror (22-13))
+	ror     y0, (11-6)              ; y0 = (e ror (11-6)) ^ (e ror (25-6))
+	xor     y2, g                   ; y2 = f^g
+	psrld   XTMP2, 18               ; XTMP2 = W[-15] >> 18
+	ror     y1, (13-2)              ; y1 = (a ror (13-2)) ^ (a ror (22-2))
+	xor     y0, e                   ; y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
+	and     y2, e                   ; y2 = (f^g)&e
+	ror     y0, 6                   ; y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+	pxor    XTMP1, XTMP3            ; XTMP1 = (W[-15] ror 7) ^ (W[-15] << (32-18))
+	xor     y1, a                   ; y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
+	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
+	psrld   XTMP4, 3                ; XTMP4 = W[-15] >> 3
+	add     y2, y0                  ; y2 = S1 + CH
+	add     y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH
+	ror     y1, 2                   ; y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+	pxor    XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov     y0, a                   ; y0 = a
+	add     h, y2                   ; h = h + S1 + CH + k + w
+	mov     y2, a                   ; y2 = a
+	pxor    XTMP1, XTMP4            ; XTMP1 = s0
+	or      y0, c                   ; y0 = a|c
+	add     d, h                    ; d = d + h + S1 + CH + k + w
+	and     y2, c                   ; y2 = a&c
+	;; compute low s1
+	pshufd  XTMP2, X3, 11111010B    ; XTMP2 = W[-2] {BBAA}
+	and     y0, b 			; y0 = (a|c)&b
+	add     h, y1                   ; h = h + S1 + CH + k + w + S0
+	paddd   XTMP0, XTMP1            ; XTMP0 = W[-16] + W[-7] + s0
+	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
+	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {BBAA}
+	mov     y0, e                   ; y0 = e
+	mov     y1, a                   ; y1 = a
+	ror     y0, (25-11)             ; y0 = e ror (25-11)
+	movdqa  XTMP4, XTMP2            ; XTMP4 = W[-2] {BBAA}
+	xor     y0, e                   ; y0 = e ^ (e ror (25-11))
+	ror     y1, (22-13)             ; y1 = a ror (22-13)
+	mov     y2, f                   ; y2 = f
+	xor     y1, a                   ; y1 = a ^ (a ror (22-13))
+	ror     y0, (11-6)              ; y0 = (e ror (11-6)) ^ (e ror (25-6))
+	psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xBxA}
+	xor     y2, g                   ; y2 = f^g
+	psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xBxA}
+	xor     y0, e                   ; y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
+	and     y2, e                   ; y2 = (f^g)&e
+	psrld   XTMP4, 10               ; XTMP4 = W[-2] >> 10 {BBAA}
+	ror     y1, (13-2)              ; y1 = (a ror (13-2)) ^ (a ror (22-2))
+	xor     y1, a                   ; y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
+	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
+	ror     y0, 6                   ; y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+	pxor    XTMP2, XTMP3            ; XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
+	add     y2, y0                  ; y2 = S1 + CH
+	ror     y1, 2                   ; y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+	add     y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH
+	pxor    XTMP4, XTMP2            ; XTMP4 = s1 {xBxA}
+	mov     y0, a                   ; y0 = a
+	add     h, y2                   ; h = h + S1 + CH + k + w
+	mov     y2, a                   ; y2 = a
+	pshufb  XTMP4, SHUF_00BA        ; XTMP4 = s1 {00BA}
+	or      y0, c                   ; y0 = a|c
+	add     d, h                    ; d = d + h + S1 + CH + k + w
+	and     y2, c                   ; y2 = a&c
+	paddd   XTMP0, XTMP4            ; XTMP0 = {..., ..., W[1], W[0]}
+	and     y0, b                   ; y0 = (a|c)&b
+	add     h, y1                   ; h = h + S1 + CH + k + w + S0
+	;; compute high s1
+	pshufd  XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {DDCC}
+	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
+	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {DDCC}
+	mov     y0, e                   ; y0 = e
+	ror     y0, (25-11)             ; y0 = e ror (25-11)
+	mov     y1, a                   ; y1 = a
+	movdqa  X0, XTMP2               ; X0    = W[-2] {DDCC}
+	ror     y1, (22-13)             ; y1 = a ror (22-13)
+	xor     y0, e                   ; y0 = e ^ (e ror (25-11))
+	mov     y2, f                   ; y2 = f
+	ror     y0, (11-6)              ; y0 = (e ror (11-6)) ^ (e ror (25-6))
+	psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xDxC}
+	xor     y1, a                   ; y1 = a ^ (a ror (22-13))
+	xor     y2, g                   ; y2 = f^g
+	psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xDxC}
+	xor     y0, e                   ; y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
+	and     y2, e                   ; y2 = (f^g)&e
+	ror     y1, (13-2)              ; y1 = (a ror (13-2)) ^ (a ror (22-2))
+	psrld   X0, 10                  ; X0 = W[-2] >> 10 {DDCC}
+	xor     y1, a                   ; y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
+	ror     y0, 6                   ; y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
+	pxor    XTMP2, XTMP3            ; XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
+	ror     y1, 2                   ; y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+	add     y2, y0                  ; y2 = S1 + CH
+	add     y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH
+	pxor    X0, XTMP2               ; X0 = s1 {xDxC}
+	mov     y0, a                   ; y0 = a
+	add     h, y2                   ; h = h + S1 + CH + k + w
+	mov     y2, a                   ; y2 = a
+	pshufb  X0, SHUF_DC00           ; X0 = s1 {DC00}
+	or      y0, c                   ; y0 = a|c
+	add     d, h                    ; d = d + h + S1 + CH + k + w
+	and     y2, c                   ; y2 = a&c
+	paddd   X0, XTMP0               ; X0 = {W[3], W[2], W[1], W[0]}
+	and     y0, b                   ; y0 = (a|c)&b
+	add     h, y1                   ; h = h + S1 + CH + k + w + S0
+	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
+	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	rotate_Xs
+%endmacro
+
+;; DO_ROUND i: one SHA-256 round without message scheduling; K+W is read from [rsp + _XFER + %1 * 4] ("ror" = 32-bit rotate right)
+%macro DO_ROUND 1
+	mov     y0, e                 ; y0 = e
+	ror     y0, (25-11)           ; y0 = e ror (25-11)
+	mov     y1, a                 ; y1 = a
+	xor     y0, e                 ; y0 = e ^ (e ror (25-11))
+	ror     y1, (22-13)           ; y1 = a ror (22-13)
+	mov     y2, f                 ; y2 = f
+	xor     y1, a                 ; y1 = a ^ (a ror (22-13))
+	ror     y0, (11-6)            ; y0 = (e ror (11-6)) ^ (e ror (25-6))
+	xor     y2, g                 ; y2 = f^g
+	xor     y0, e                 ; y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
+	ror     y1, (13-2)            ; y1 = (a ror (13-2)) ^ (a ror (22-2))
+	and     y2, e                 ; y2 = (f^g)&e
+	xor     y1, a                 ; y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
+	ror     y0, 6                 ; y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+	xor     y2, g                 ; y2 = CH = ((f^g)&e)^g
+	add     y2, y0                ; y2 = S1 + CH
+	ror     y1, 2                 ; y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+	%xdefine offset (%1 * 4 + _XFER)
+	add     y2, [rsp + offset]    ; y2 = k + w + S1 + CH
+	mov     y0, a                 ; y0 = a
+	add     h, y2                 ; h = h + S1 + CH + k + w
+	mov     y2, a                 ; y2 = a
+	or      y0, c                 ; y0 = a|c
+	add     d, h                  ; d = d + h + S1 + CH + k + w
+	and     y2, c                 ; y2 = a&c
+	and     y0, b                 ; y0 = (a|c)&b
+	add     h, y1                 ; h = h + S1 + CH + k + w + S0
+	or      y0, y2 		      ; y0 = MAJ = ((a|c)&b)|(a&c)
+	add     h, y0 		      ; h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS                   ; rename a..h for the next round (no code emitted)
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha256_opt_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only lane IDX is read and updated here)
+; arg 1 : NBLK : size (in blocks); 0 returns at once, digest untouched and MGR preserved
+; invisible arg 2 : IDX : hash on which lane (arrives in the low 8 bits of r10)
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it; arrives as r10 >> 8)
+; 		 (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs, xmm0-xmm12
+;	{rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack; MGR is reloaded from _TMGR on exit}
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+mk_global sha256_opt_x1, function, internal
+sha256_opt_x1:
+	endbranch
+	sub     rsp, STACK_SIZE
+	mov     [rsp + _GPR_SAVE + 8*0], rbx
+	mov     [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     [rsp + _GPR_SAVE + 8*2], rdi
+	mov     [rsp + _GPR_SAVE + 8*3], rsi
+	; caller has already stored XMM6~10; NOTE(review): xmm11/xmm12 are written here too -- confirm the caller saves them
+%endif
+	mov     [rsp + _GPR_SAVE + 8*4], r12
+	mov     [rsp + _GPR_SAVE + 8*5], r13
+	mov     [rsp + _GPR_SAVE + 8*6], r14
+	mov     [rsp + _GPR_SAVE + 8*7], r15
+	mov     [rsp + _GPR_SAVE + 8*8], rdx
+	mov     [rsp + _TMGR], MGR	 ; stored before any exit: done_hash reloads MGR from this slot
+
+	shl     NBLK, 6 		 ; convert to bytes
+	jz      done_hash		 ; zero blocks: nothing to hash
+
+	; detach idx from nlanx4
+	mov	IDX, NLANX4
+	shr	NLANX4, 8
+	and	IDX, 0xff
+
+	;; Load input pointers
+	mov     INP, [MGR + _data_ptr + IDX*8]
+	mov     [rsp + _INP], INP
+	;; nblk is used to indicate data end
+	add     NBLK, INP
+	mov     [rsp + _INP_END], NBLK  ; pointer to end of data
+
+
+	mov     TMGR, [rsp + _TMGR]	; r9 held INP until now; a..h are about to overwrite MGR's register
+	;; load initial digest
+	lea	TMP, [TMGR + 4*IDX]
+	mov     a, [TMP + 0*NLANX4]
+	mov     b, [TMP + 1*NLANX4]
+	mov     c, [TMP + 2*NLANX4]
+	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
+	mov     d, [TMP + 1*NLANX4]
+	mov     e, [TMP + 2*NLANX4]
+	mov     g, [TMP + 4*NLANX4]
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+	mov     f, [TMP + 2*NLANX4]
+	mov     h, [TMP + 4*NLANX4]
+
+	movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+	movdqa  SHUF_00BA, [_SHUF_00BA]
+	movdqa  SHUF_DC00, [_SHUF_DC00]
+
+	mov     INP, [rsp + _INP]
+loop0:					; one iteration per 64-byte block
+	lea     TBL, [K256]
+
+	;; byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, [INP + 0*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, [INP + 1*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, [INP + 2*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, [INP + 3*16], BYTE_FLIP_MASK
+
+	mov     [rsp + _INP], INP	; park INP: r9 becomes SRND below
+
+	;; schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov     SRND, 3
+
+loop1:					; 16 rounds per pass; a..h and X0..X3 names are back in place at the branch
+	movdqa  XFER, [TBL]
+	paddd   XFER, X0
+	movdqa  [rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  XFER, [TBL + 1*16]
+	paddd   XFER, X0
+	movdqa  [rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  XFER, [TBL + 2*16]
+	paddd   XFER, X0
+	movdqa  [rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  XFER, [TBL + 3*16]
+	paddd   XFER, X0
+	movdqa  [rsp + _XFER], XFER
+	add     TBL, 4*16
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     SRND, 1
+	jne     loop1
+
+	mov     SRND, 2			; last 16 rounds: 2 passes of 8, no more scheduling
+loop2:
+	paddd   X0, [TBL]
+	movdqa  [rsp + _XFER], X0
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+	paddd   X1, [TBL + 1*16]
+	movdqa  [rsp + _XFER], X1
+	add     TBL, 2*16
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	movdqa  X0, X2			; second pass consumes the remaining eight W dwords
+	movdqa  X1, X3
+
+	sub     SRND, 1
+	jne     loop2
+
+	; write out digests
+	mov     TMGR, [rsp + _TMGR]
+	lea	TMP, [TMGR + 4*IDX]
+	addm    a, [TMP + 0*NLANX4]
+	addm    b, [TMP + 1*NLANX4]
+	addm    c, [TMP + 2*NLANX4]
+	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
+	addm    d, [TMP + 1*NLANX4]
+	addm    e, [TMP + 2*NLANX4]
+	addm    g, [TMP + 4*NLANX4]
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+	addm    f, [TMP + 2*NLANX4]
+	addm    h, [TMP + 4*NLANX4]
+
+	mov     INP, [rsp + _INP]
+	add     INP, 64
+	cmp     INP, [rsp + _INP_END]
+	jne     loop0
+
+done_hash:
+	mov     MGR, [rsp + _TMGR]	; valid on every path: the slot is written in the prologue
+
+	mov     rdx, [rsp + _GPR_SAVE + 8*8]
+	mov     r15, [rsp + _GPR_SAVE + 8*7]
+	mov     r14, [rsp + _GPR_SAVE + 8*6]
+	mov     r13, [rsp + _GPR_SAVE + 8*5]
+	mov     r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+	mov     rsi, [rsp + _GPR_SAVE + 8*3]
+	mov     rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov     rbp, [rsp + _GPR_SAVE + 8*1]
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	add     rsp, STACK_SIZE
+
+	ret
+
+section .data	; NOTE(review): nothing in this file writes these tables -- a read-only section may also work, confirm per output format
+align 64
+K256:	; the 64 SHA-256 round constants, four per row; sha256_opt_x1 walks them through TBL
+        DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5	; K[0..3]
+        DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5	; K[4..7]
+        DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3	; K[8..11]
+        DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174	; K[12..15]
+        DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc	; K[16..19]
+        DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da	; K[20..23]
+        DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7	; K[24..27]
+        DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967	; K[28..31]
+        DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13	; K[32..35]
+        DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85	; K[36..39]
+        DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3	; K[40..43]
+        DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070	; K[44..47]
+        DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5	; K[48..51]
+        DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3	; K[52..55]
+        DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208	; K[56..59]
+        DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	; K[60..63]
+
+PSHUFFLE_BYTE_FLIP_MASK:	; pshufb control: reverses the byte order inside each 32-bit dword
+	DQ 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA: dwords 0 and 2 move to dwords 0 and 1; 0xFF selectors zero the upper half
+_SHUF_00BA:
+	DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+
+; shuffle xDxC -> DC00: dwords 0 and 2 move to dwords 2 and 3; 0xFF selectors zero the lower half
+_SHUF_DC00:
+	DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
new file mode 100644
index 000000000..c3515dc52
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
@@ -0,0 +1,204 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x6a09e667
+#define H1 0xbb67ae85
+#define H2 0x3c6ef372
+#define H3 0xa54ff53a
+#define H4 0x510e527f
+#define H5 0x9b05688c
+#define H6 0x1f83d9ab
+#define H7 0x5be0cd19
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+	if (i<16) W(i) = to_be32(ww[i]); \
+	else \
+	W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+	t2 = s0(a) + maj(a,b,c); \
+	t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+	d += t1; \
+	h = t1 + t2;
+
+static void OPT_FIX sha256_single(const uint8_t * data, uint32_t digest[]);
+
+/* Reference one-shot SHA-256: hashes len bytes at input_data and leaves the eight 32-bit state words in digest[0..7]. */
+void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+	uint32_t i;				/* bytes still to hash, then the size of the padded tail */
+	uint8_t buf[2 * SHA256_BLOCK_SIZE];	/* padded tail: one or two blocks */
+	uint64_t len_bits_be = to_be64((uint64_t) len * 8);	/* message length in bits, big-endian */
+	digest[0] = H0;
+	digest[1] = H1;
+	digest[2] = H2;
+	digest[3] = H3;
+	digest[4] = H4;
+	digest[5] = H5;
+	digest[6] = H6;
+	digest[7] = H7;
+
+	i = len;
+	while (i >= SHA256_BLOCK_SIZE) {	/* compress every complete block straight from the input */
+		sha256_single(input_data, digest);
+		input_data += SHA256_BLOCK_SIZE;
+		i -= SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(buf, input_data, i);		/* leftover bytes, fewer than one block */
+	buf[i++] = 0x80;			/* padding marker byte right after the data */
+	memset(buf + i, 0, sizeof(buf) - i);	/* zero everything after the marker */
+
+	if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)	/* no room left for the length field */
+		i = 2 * SHA256_BLOCK_SIZE;
+	else
+		i = SHA256_BLOCK_SIZE;
+	/* Write the length into the last 8 bytes with memcpy: buf is a byte array with no 8-byte alignment guarantee. */
+	memcpy(buf + i - sizeof(len_bits_be), &len_bits_be, sizeof(len_bits_be));
+
+	sha256_single(buf, digest);
+	if (i == 2 * SHA256_BLOCK_SIZE)
+		sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+/* SHA-256 compression of one 64-byte block: runs 64 rounds over `data` and adds the result into digest[0..7]. */
+void sha256_single(const uint8_t * data, uint32_t digest[])
+{
+	uint32_t a, b, c, d, e, f, g, h, t1, t2;	/* working variables and per-round temporaries used by step() */
+	uint32_t w[16], ww[16];	/* w: rolling message schedule; ww: raw block words read by step() for rounds 0..15 */
+	memcpy(ww, data, sizeof(ww));	/* copy instead of casting data to uint32_t *: data may be unaligned and is const */
+	a = digest[0];
+	b = digest[1];
+	c = digest[2];
+	d = digest[3];
+	e = digest[4];
+	f = digest[5];
+	g = digest[6];
+	h = digest[7];
+
+	step(0, a, b, c, d, e, f, g, h, 0x428a2f98);	/* the variable roles rotate one place per round, so no values are shuffled */
+	step(1, h, a, b, c, d, e, f, g, 0x71374491);
+	step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+	step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+	step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+	step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+	step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+	step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+	step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+	step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+	step(10, g, h, a, b, c, d, e, f, 0x243185be);
+	step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+	step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+	step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+	step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+	step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+	step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);	/* from round 16 on, step() extends the schedule instead of reading ww[] */
+	step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+	step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+	step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+	step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+	step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+	step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+	step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+	step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+	step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+	step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+	step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+	step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+	step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+	step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+	step(31, b, c, d, e, f, g, h, a, 0x14292967);
+	step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+	step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+	step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+	step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+	step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+	step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+	step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+	step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+	step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+	step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+	step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+	step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+	step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+	step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+	step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+	step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+	step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+	step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+	step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+	step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+	step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+	step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+	step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+	step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+	step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+	step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+	step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+	step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+	step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+	step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+	step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+	step(63, b, c, d, e, f, g, h, a, 0xc67178f2);	/* 64 rounds is a multiple of 8, so a..h are back in their own slots */
+
+	digest[0] += a;
+	digest[1] += b;
+	digest[2] += c;
+	digest[3] += d;
+	digest[4] += e;
+	digest[5] += f;
+	digest[6] += g;
+	digest[7] += h;
+}