Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sha512_mb')
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am | 91
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c | 259
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm | 54
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm | 72
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm | 218
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm | 239
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm | 266
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm | 221
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c | 44
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c | 42
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c | 42
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm | 258
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm | 266
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm | 279
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm | 256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c | 171
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c | 197
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c | 294
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c | 264
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c | 143
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm | 438
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm | 420
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm | 483
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm | 639
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c | 256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c | 46
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c | 38
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c | 65
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm | 394
34 files changed, 7725 insertions(+), 0 deletions(-)
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
new file mode 100644
index 000000000..6fc22d132
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
@@ -0,0 +1,91 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += sha512_mb/sha512_ctx_sse.c \
+ sha512_mb/sha512_ctx_avx.c \
+ sha512_mb/sha512_ctx_avx2.c \
+ sha512_mb/sha512_ctx_sb_sse4.c
+
+lsrc += sha512_mb/sha512_mb_mgr_init_sse.c \
+ sha512_mb/sha512_mb_mgr_init_avx2.c \
+ sha512_mb/sha512_sb_mgr_init_sse4.c
+
+
+lsrc += sha512_mb/sha512_mb_mgr_submit_sse.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx2.asm \
+ sha512_mb/sha512_mb_mgr_flush_sse.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx2.asm \
+ sha512_mb/sha512_mb_x2_sse.asm \
+ sha512_mb/sha512_mb_x2_avx.asm \
+ sha512_mb/sha512_mb_x4_avx2.asm \
+ sha512_mb/sha512_multibinary.asm \
+ sha512_mb/sha512_sb_mgr_submit_sse4.c \
+ sha512_mb/sha512_sb_mgr_flush_sse4.c \
+ sha512_mb/sha512_sse4.asm
+
+lsrc += sha512_mb/sha512_ctx_avx512.c \
+ sha512_mb/sha512_mb_mgr_init_avx512.c \
+ sha512_mb/sha512_mb_mgr_submit_avx512.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx512.asm \
+ sha512_mb/sha512_mb_x8_avx512.asm
+
+extern_hdrs += include/sha512_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ sha512_mb/sha512_job.asm \
+ sha512_mb/sha512_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha512_mb/sha512_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha512_mb/sha512_mb_test \
+ sha512_mb/sha512_mb_rand_test \
+ sha512_mb/sha512_mb_rand_update_test
+
+unit_tests += sha512_mb/sha512_mb_rand_ssl_test
+
+perf_tests += sha512_mb/sha512_mb_vs_ossl_perf
+
+sha512_mb_rand_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_update_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_update_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha512_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
new file mode 100644
index 000000000..4e5173155
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx_slver_02020166;
+struct slver sha512_ctx_mgr_init_avx_slver = { 0x0166, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_submit_avx_slver_02020167;
+struct slver sha512_ctx_mgr_submit_avx_slver = { 0x0167, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_flush_avx_slver_02020168;
+struct slver sha512_ctx_mgr_flush_avx_slver = { 0x0168, 0x02, 0x02 };
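The three entry points above (init/submit/flush) are the whole per-arch API surface; the slver structs at the end are only version markers for the symbol-version tooling. A minimal one-shot usage sketch, assuming the declarations from sha512_mb.h and the hash_ctx_init() helper from multi_buffer.h (error handling elided; HASH_ENTIRE is HASH_FIRST|HASH_LAST, as the flags check in submit implies):

#include <stdio.h>
#include "sha512_mb.h"

int main(void)
{
	SHA512_HASH_CTX_MGR mgr;    // the unit tests allocate this aligned; plain stack is fine for a sketch
	SHA512_HASH_CTX ctx;
	SHA512_HASH_CTX *done;
	const char msg[] = "abc";

	sha512_ctx_mgr_init_avx(&mgr);
	hash_ctx_init(&ctx);        // helper from multi_buffer.h (assumed available)

	// One-shot hash: FIRST|LAST in a single submit. Submit may return NULL
	// while the lane is still being filled, hence the flush loop below.
	sha512_ctx_mgr_submit_avx(&mgr, &ctx, msg, sizeof(msg) - 1, HASH_ENTIRE);

	// Drain jobs still in flight; each flush hands back at most one finished ctx
	while ((done = sha512_ctx_mgr_flush_avx(&mgr)) != NULL)
		printf("digest[0] = %016llx\n",
		       (unsigned long long)done->job.result_digest[0]);
	return 0;
}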
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
new file mode 100644
index 000000000..d1b7d7270
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx2(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx2_slver_04020169;
+struct slver sha512_ctx_mgr_init_avx2_slver = { 0x0169, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_submit_avx2_slver_04020170;
+struct slver sha512_ctx_mgr_submit_avx2_slver = { 0x0170, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_flush_avx2_slver_04020171;
+struct slver sha512_ctx_mgr_flush_avx2_slver = { 0x0171, 0x02, 0x04 };
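hash_pad(), identical in every variant in this series, places the 0x80 marker and the big-endian bit-length field with branch-free arithmetic, and its return value is the number of extra blocks to hash. Concrete numbers make the rounding easier to audit; a standalone sketch of just that arithmetic, with the SHA-512 constants written out (128-byte blocks, 16-byte length field) so it runs without the library headers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK 128u
#define LENFIELD 16u

static uint32_t pad_blocks(uint32_t total_len)
{
	uint32_t i = total_len & (BLOCK - 1);     // bytes already sitting in the partial block
	// 0x80 goes at offset i; round up so 1 marker byte + 16 length bytes fit,
	// exactly as the expression in hash_pad() above does
	i += ((BLOCK - 1) & (0 - (total_len + LENFIELD + 1))) + 1 + LENFIELD;
	return i >> 7;                            // SHA512_LOG2_BLOCK_SIZE == 7
}

int main(void)
{
	assert(pad_blocks(0) == 1);    // empty message: one padded block
	assert(pad_blocks(100) == 1);  // 100 + 1 + 16 <= 128: still one block
	assert(pad_blocks(112) == 2);  // 112 + 1 + 16 > 128: spills into a second block
	assert(pad_blocks(128) == 1);  // exact block boundary: padding opens a fresh block
	printf("hash_pad arithmetic checks out\n");
	return 0;
}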
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
new file mode 100644
index 000000000..f99116eb1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
@@ -0,0 +1,259 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx512(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx512_slver_0600016a;
+struct slver sha512_ctx_mgr_init_avx512_slver = { 0x016a, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_submit_avx512_slver_0600016b;
+struct slver sha512_ctx_mgr_submit_avx512_slver = { 0x016b, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_flush_avx512_slver_0600016c;
+struct slver sha512_ctx_mgr_flush_avx512_slver = { 0x016c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
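Unlike the SSE/AVX/AVX2 files, this whole translation unit compiles away when the build's assembler cannot emit AVX-512 (the HAVE_AS_KNOWS_AVX512 guard wrapping the file). The shipped library selects a variant at run time via the CPUID-based stubs in sha512_multibinary.asm; the compile-time sketch below exists only to illustrate the guard, and is an assumption about how a caller might use it, not the library's own dispatch:

#include "sha512_mb.h"

// Illustrative fallback: prefer the AVX-512 manager when it was built at all.
static void sha512_ctx_mgr_init_widest(SHA512_HASH_CTX_MGR *mgr)
{
#ifdef HAVE_AS_KNOWS_AVX512
	sha512_ctx_mgr_init_avx512(mgr);  // exists only when the assembler supports AVX-512
#else
	sha512_ctx_mgr_init_avx2(mgr);    // next-widest variant is always built
#endif
}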
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
new file mode 100644
index 000000000..6b44f075c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_sb_mgr_init_sse4(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sb_sse4(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_flush_sse4(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sb_sse4_slver_05020172;
+struct slver sha512_ctx_mgr_init_sb_sse4_slver = { 0x0172, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver_05020173;
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver = { 0x0173, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver_05020174;
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver = { 0x0174, 0x02, 0x05 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
new file mode 100644
index 000000000..b4dfe5332
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sse(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sse_slver_00020163;
+struct slver sha512_ctx_mgr_init_sse_slver = { 0x0163, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_submit_sse_slver_00020164;
+struct slver sha512_ctx_mgr_submit_sse_slver = { 0x0164, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_flush_sse_slver_00020165;
+struct slver sha512_ctx_mgr_flush_sse_slver = { 0x0165, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
new file mode 100644
index 000000000..7f2bdae48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
@@ -0,0 +1,54 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_SHA512
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in bytes
+FIELD _result_digest, 8*8, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA512_JOB_size _FIELD_OFFSET
+%assign _SHA512_JOB_align _STRUCT_ALIGN
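FIELD keeps a running offset: it aligns the offset up to the member's alignment, places the member, then advances by its size; _FIELD_OFFSET ends as the struct size and _STRUCT_ALIGN as the largest alignment seen (semantics assumed from include/datastruct.asm). A quick C sketch of that arithmetic applied to the JOB layout above:

#include <stdio.h>

static unsigned off;

// Mimics the assumed FIELD macro: align up, place, advance.
static unsigned field(const char *name, unsigned size, unsigned align)
{
	off = (off + align - 1) & ~(align - 1);
	unsigned at = off;
	printf("%-15s offset %3u size %3u\n", name, at, size);
	off += size;
	return at;
}

int main(void)
{
	off = 0;
	field("_buffer", 8, 8);              // offset 0
	field("_len", 8, 8);                 // offset 8
	field("_result_digest", 8 * 8, 64);  // aligned up to offset 64 for SIMD loads
	field("_status", 4, 4);              // offset 128
	field("_user_data", 8, 8);           // offset 136
	printf("_SHA512_JOB_size = %u\n", off);  // 144, before any trailing struct alignment
	return 0;
}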
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..d1578109e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
@@ -0,0 +1,72 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA512_ARGS_X8
+;;; name size align
+FIELD _digest, 8*8*8, 4 ; transposed digest
+FIELD _data_ptr, 8*8, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA512_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA512_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X8_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD _lens, 8*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
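
Note that _SHA512_ARGS_X4 and _SHA512_ARGS_X8 are assigned from the same FIELD list: the digest area is sized for eight lanes, and narrower kernels simply address it with a smaller row stride. The digest is stored transposed, so word w of lane l sits at byte offset 8*l + w*8*nlanes, which is what the manager files below compute as [_args_digest + 8*idx + w*32] (4-lane shape) or [... + w*64] (8-lane shape). A C rendering of that addressing:

    #include <stddef.h>
    #include <stdint.h>

    /* Transposed digest addressing: row = digest word, column = lane, so
     * one vector load per digest word covers all lanes at once. */
    static inline uint64_t *digest_word(uint64_t *args_digest,
                                        size_t nlanes, size_t lane, size_t word)
    {
        return args_digest + word * nlanes + lane;
    }
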
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..33c62773a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
@@ -0,0 +1,218 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_avx:function
+sha512_mb_mgr_flush_avx:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
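
In outline the flush path is: bail out if every lane is idle, pick any lane that still holds a job, park the idle lanes on that lane's data pointer with a huge length so they can never win the min search, then run the x2 kernel for the minimum block count and complete the winning job. A scalar C sketch of the control flow up to the kernel call (stand-in types, not the shipped SHA512_MB_JOB_MGR):

    #include <stddef.h>
    #include <stdint.h>

    #define NLANES 2                      /* x2 AVX kernel */

    struct lane_sk { void *job_in_lane; };
    struct mgr_sk {
        uint64_t lens[NLANES];            /* {blocks:32 | lane idx} per lane */
        struct lane_sk ldata[NLANES];
        const void *data_ptr[NLANES];
    };

    /* Mirror of the asm flush path: returns the lane to complete next,
     * or -1 when every lane is already idle (the return_null path). */
    static int flush_pick(struct mgr_sk *s)
    {
        int idx = (s->ldata[1].job_in_lane != NULL) ? 1 : 0;
        if (s->ldata[idx].job_in_lane == NULL)
            return -1;

        for (int i = 0; i < NLANES; i++)      /* copy_lane_data loop */
            if (s->ldata[i].job_in_lane == NULL) {
                s->data_ptr[i] = s->data_ptr[idx];
                s->lens[i] = (0xFFFFFFFFull << 32) | (uint32_t)s->lens[i];
            }

        uint64_t min = s->lens[0];            /* the cmp/cmovb min search */
        for (int i = 1; i < NLANES; i++)
            if (s->lens[i] < min)
                min = s->lens[i];
        return (int)(min & 0xF);              /* low nibble = lane idx */
    }
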
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..61c25aaef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx2(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_avx2:function
+sha512_mb_mgr_flush_avx2:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
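
Unlike the SSE/AVX flush, which can just sub/add a constant from rsp, the AVX2 variant realigns the frame to 32 bytes and must therefore stash the caller's rsp inside the frame (stack_frame.rsp) and reload it on exit, since the AND may round down by an unknown amount. The prologue's pointer arithmetic, as a checked sketch:

    #include <assert.h>
    #include <stdint.h>

    /* Models: rax = rsp; rsp -= STACK_SPACE; rsp &= ~31; the original rsp
     * is saved in the frame so the epilogue restores it exactly. */
    static uint64_t frame_top(uint64_t rsp, uint64_t stack_space)
    {
        uint64_t aligned = (rsp - stack_space) & ~(uint64_t)31;
        assert((aligned & 31) == 0 && aligned <= rsp - stack_space);
        return aligned;
    }
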
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..c16517821
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
@@ -0,0 +1,266 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx512(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_avx512:function
+sha512_mb_mgr_flush_avx512:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length; lens entries are 64-bit: high 32 bits = block count, low 8 bits = lane idx
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+	shr	len2, 32	; SHA-512 block size is 1024 bits
+ jz len_is_0
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqu xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqu xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqu xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:	; mgr len element is 0xnnnnnnnn0000000m: nnnnnnnn = block count, m = lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_flush_avx512
+no_sha512_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
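
The AVX-512 flush replaces the scalar cmp/cmovb chain with a vpminuq reduction tree over the eight packed lens words. Because each entry encodes {block count in the high 32 bits | lane index in the low byte}, a single unsigned min yields both the smallest block count and the lane that holds it. A scalar C rendering of that encoding and reduction:

    #include <stdint.h>

    /* lens[] entry layout, per the "Find min length" comment above. */
    static inline uint64_t lens_encode(uint32_t blocks, uint8_t lane)
    {
        return ((uint64_t)blocks << 32) | lane;
    }

    /* Unsigned min over all entries: the winner carries the next lane to
     * run in its low nibble and the shared block count in its high half. */
    static uint64_t lens_min8(const uint64_t lens[8])
    {
        uint64_t m = lens[0];
        for (int i = 1; i < 8; i++)
            if (lens[i] < m)
                m = lens[i];
        return m;       /* lane = m & 0xF, blocks = m >> 32 */
    }
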
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..602d95330
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
@@ -0,0 +1,221 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_sse(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_sse:function
+sha512_mb_mgr_flush_sse:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
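
unused_lanes is a small stack of free-lane indices packed one per byte and terminated by 0xFF: popping shifts right by eight bits, pushing shifts left and ORs the index in. The `bt unused_lanes, 16+7` test above checks bit 23, i.e. whether the 0xFF terminator has sunk to byte 2, which for the two-lane SSE/AVX managers is only true when both lanes are free. A sketch of the encoding:

    #include <stdint.h>

    /* Byte-stack of free lane indices, 0xFF-terminated
     * (cf. init_sse: 0xFF0100 = lanes 0 and 1 free). */
    static inline int lane_pop(uint64_t *unused_lanes)
    {
        int lane = (int)(*unused_lanes & 0xFF);
        *unused_lanes >>= 8;
        return lane;
    }

    static inline void lane_push(uint64_t *unused_lanes, int lane)
    {
        *unused_lanes = (*unused_lanes << 8) | (unsigned)lane;
    }

    /* All lanes free <=> the 0xFF marker sits in byte 2, so bit 23 is set. */
    static inline int all_lanes_free2(uint64_t unused_lanes)
    {
        return (int)((unused_lanes >> (16 + 7)) & 1);
    }
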
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..da57e05d5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,44 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx2(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->lens[2] = 2;
+ state->lens[3] = 3;
+ state->unused_lanes = 0xFF03020100;
+ for (j = 0; j < SHA512_X4_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
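
The magic 0xFF03020100 seeds the free-lane stack with lanes 0..3 topped by the 0xFF terminator, and lens[i] = i pre-loads each length word's low bits with its own lane index so the min search can recover the lane from the winning value. A quick self-check of the constant:

    #include <assert.h>
    #include <stdint.h>

    /* Four pops off 0xFF03020100 yield lanes 0,1,2,3; the fifth byte
     * is the 0xFF end-of-stack marker. */
    int main(void)
    {
        uint64_t ul = 0xFF03020100ull;
        for (int lane = 0; lane < 4; lane++) {
            assert((ul & 0xFF) == (uint64_t)lane);
            ul >>= 8;
        }
        assert((ul & 0xFF) == 0xFF);
        return 0;
    }
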
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..2ce996cf1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx512(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->unused_lanes = 0x0706050403020100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_MAX_LANES; j++) {
+		state->lens[j] = j;	// sha512_mb uses the low 32 bits of lens[] solely to hold the lane idx
+ state->ldata[j].job_in_lane = 0;
+ }
+}
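
Here 0x0706050403020100 fills all eight bytes of the 64-bit word with lane indices, leaving no room for the 0xFF sentinel the narrower managers probe with bt; that is why the 8-lane manager carries an explicit num_lanes_inuse counter, which flush tests against 0 and submit tests against 8. A sketch of that occupancy logic (stand-in struct):

    #include <stdint.h>

    /* Explicit occupancy counter for the 8-lane manager, replacing the
     * sentinel-bit test used by the 2- and 4-lane variants. */
    struct occupancy { uint32_t num_lanes_inuse; };

    static inline int mgr_empty(const struct occupancy *s)
    {
        return s->num_lanes_inuse == 0;     /* flush bails out */
    }

    static inline int mgr_full(const struct occupancy *s)
    {
        return s->num_lanes_inuse == 8;     /* submit runs the x8 kernel */
    }
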
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
new file mode 100644
index 000000000..d646d88fd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_sse(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->unused_lanes = 0xFF0100;
+ for (j = 0; j < SHA512_MIN_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
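
Across the three init routines the lane counts track SIMD width over the 64-bit SHA-512 word size: 128-bit SSE/AVX gives two lanes, 256-bit AVX2 four, 512-bit AVX-512 eight, matching the SHA512_MIN_LANES, SHA512_X4_LANES, and SHA512_MAX_LANES loop bounds used above. A one-liner makes the relation explicit:

    #include <assert.h>

    /* lanes = SIMD width / 64-bit SHA-512 word size */
    static int sha512_lanes(int simd_bits) { return simd_bits / 64; }

    int main(void)
    {
        assert(sha512_lanes(128) == 2);   /* SSE / AVX */
        assert(sha512_lanes(256) == 4);   /* AVX2      */
        assert(sha512_lanes(512) == 8);   /* AVX-512   */
        return 0;
    }
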
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..d9ef88474
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_avx:function
+sha512_mb_mgr_submit_avx:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
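
Two reading notes on this file: the _XMM_SAVE/_GPR_SAVE defines are swapped relative to their use (the GPR saves go through _XMM_SAVE, which aliases stack_frame.gpr, while the xmm saves use raw 16*n offsets at the frame base), and the H0..H7 words in the data section appear unreferenced here and match the 32-bit SHA-256 initial values, so they look like carry-over from the SHA-256 manager files. Functionally, submit is the mirror of flush: pop a free lane, mark the job in progress, scatter its digest into the transposed args area, and only start the kernel once the free-lane stack holds nothing but the 0xFF terminator (`cmp unused_lanes, 0xff`); otherwise return NULL so the caller keeps queueing. A condensed sketch of that gate (stand-in types, not the shipped API):

    #include <stddef.h>
    #include <stdint.h>

    enum { JOB_BEING_PROCESSED = 1 };

    struct job_sk  { uint32_t status; };
    struct mgr2_sk { uint64_t unused_lanes; struct job_sk *job_in_lane[2]; };

    /* Queue a job; report whether the 2-lane kernel should run now. */
    static int submit2(struct mgr2_sk *s, struct job_sk *j)
    {
        int lane = (int)(s->unused_lanes & 0xFF);   /* pop a free lane  */
        s->unused_lanes >>= 8;
        j->status = JOB_BEING_PROCESSED;
        s->job_in_lane[lane] = j;
        return s->unused_lanes == 0xFF;             /* only terminator left */
    }
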
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..e39b8df4d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
@@ -0,0 +1,266 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx2(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_avx2:function
+sha512_mb_mgr_submit_avx2:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
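
On submit, the four xmm loads plus vmovq/vpextrq stores above scatter the job's contiguous digest into the transposed args matrix (row stride 32 bytes in the 4-lane layout); on completion, the vmovq/vpinsrq pairs gather it back into result_digest. The scalar equivalent, per the layout in the datastruct file:

    #include <stdint.h>

    #define LANES 4   /* AVX2 manager; the AVX-512 files use 8 */

    /* Scatter job digest word w into the transposed args_digest so that
     * word w of every lane is contiguous (stride 8*LANES bytes). */
    static void digest_scatter(uint64_t *args_digest, const uint64_t d[8], int lane)
    {
        for (int w = 0; w < 8; w++)
            args_digest[w * LANES + lane] = d[w];
    }

    static void digest_gather(const uint64_t *args_digest, uint64_t d[8], int lane)
    {
        for (int w = 0; w < 8; w++)
            d[w] = args_digest[w * LANES + lane];
    }
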
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..59f359f1f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
@@ -0,0 +1,279 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_avx512:function
+sha512_mb_mgr_submit_avx512:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+ mov [rsp + _XMM_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqu [rsp + 16*0], xmm6
+ vmovdqu [rsp + 16*1], xmm7
+ vmovdqu [rsp + 16*2], xmm8
+ vmovdqu [rsp + 16*3], xmm9
+ vmovdqu [rsp + 16*4], xmm10
+ vmovdqu [rsp + 16*5], xmm11
+ vmovdqu [rsp + 16*6], xmm12
+ vmovdqu [rsp + 16*7], xmm13
+ vmovdqu [rsp + 16*8], xmm14
+ vmovdqu [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*64], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*64], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*64], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*64], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 8
+ jne return_null
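+	; (added note) submit starts hashing only once all 8 lanes are occupied;
+	; until then it returns NULL and the job waits in its lane.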
+
+start_loop:
+	; Find min length: each len entry in sha512_mgr is 64-bit; the high 32 bits hold the block count and the low 8 bits the lane index
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 32
+ jz len_is_0
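+	; (added worked example) each lens entry is (blocks << 32) | lane, so an
+	; unsigned 64-bit min yields the lane with the fewest remaining blocks
+	; together with its index, e.g.
+	;   min(0x0000000300000001, 0x0000000100000004) = 0x0000000100000004
+	; -> lane 4 with 1 block left; a zero block count (the jz above) means
+	; that lane's job is already complete.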
+
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
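+	; (added note) _unused_lanes is a byte-packed stack of free lane indexes;
+	; the shl/or above pushes the freed lane back on top.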
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 16*0]
+ vmovdqu xmm7, [rsp + 16*1]
+ vmovdqu xmm8, [rsp + 16*2]
+ vmovdqu xmm9, [rsp + 16*3]
+ vmovdqu xmm10, [rsp + 16*4]
+ vmovdqu xmm11, [rsp + 16*5]
+ vmovdqu xmm12, [rsp + 16*6]
+ vmovdqu xmm13, [rsp + 16*7]
+ vmovdqu xmm14, [rsp + 16*8]
+ vmovdqu xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ mov r15, [rsp + _XMM_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:	; mgr len element is 0xnnnnnnnn0000000m, where nnnnnnnn is the block count and m the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+
+%else
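+; (added note) when the assembler cannot emit AVX-512, win64 builds still
+; define a dummy symbol below so the object file is non-empty and links cleanly.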
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_submit_avx512
+no_sha512_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..8b630a4da
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; Windows register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_sse(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state (rdi on elf64)
+; arg 2 : rdx : job   (rsi on elf64)
+global sha512_mb_mgr_submit_sse:function
+sha512_mb_mgr_submit_sse:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movdqa xmm2, [job + _result_digest + 2*16]
+ movdqa xmm3, [job + _result_digest + 3*16]
+ movq [state + _args_digest + 8*lane + 0*32], xmm0
+ pextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ movq [state + _args_digest + 8*lane + 2*32], xmm1
+ pextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ movq [state + _args_digest + 8*lane + 4*32], xmm2
+ pextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ movq [state + _args_digest + 8*lane + 6*32], xmm3
+ pextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xff
+ jne return_null
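+	; (added note) for this 2-lane SSE manager, unused_lanes == 0xFF is the
+	; "no free lanes" sentinel, so hashing starts only when both lanes are full.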
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
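+	; (added note) same (blocks << 32) | lane encoding as the other managers:
+	; a plain unsigned compare picks the shorter lane, and masking with ~0xFF
+	; clears the lane-index byte, leaving the block count, which is zero when
+	; that job is already done.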
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
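+; (added note) the 32-bit H0-H7 values above are not referenced anywhere in
+; this file and appear to be leftover data.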
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
new file mode 100644
index 000000000..edb57bc33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
@@ -0,0 +1,171 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint64_t byteswap64(uint64_t x)
+{
+#if defined (__ICC)
+ return _bswap64(x);
+#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+ return __builtin_bswap64(x);
+#else
+ return (((x & (0xffull << 0)) << 56)
+ | ((x & (0xffull << 8)) << 40)
+ | ((x & (0xffull << 16)) << 24)
+ | ((x & (0xffull << 24)) << 8)
+ | ((x & (0xffull << 32)) >> 8)
+ | ((x & (0xffull << 40)) >> 24)
+ | ((x & (0xffull << 48)) >> 40)
+ | ((x & (0xffull << 56)) >> 56));
+#endif
+}
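+
+/* (added note) OpenSSL's SHA512() writes the digest as big-endian bytes,
+ * while the multi-buffer manager keeps result_digest as native-endian 64-bit
+ * words, so each reference word is byte-swapped before comparison. */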
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
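+	/* (added note) flush is called until it returns NULL so jobs still
+	 * parked in partially filled lanes are forced through; submit alone
+	 * never drains the manager. */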
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA512(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
new file mode 100644
index 000000000..a1b805737
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
@@ -0,0 +1,197 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%016lX <=> 0x%016lX \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha512_mb test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%016lX <=> 0x%016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+	// Test buffers that end exactly at the end of the allocation
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha512_ctx_mgr_init(mgr);
+
+	// Construct jobs that run to the end of the allocated buffer
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%016lX"
+ ", ref: 0x%016lX\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
new file mode 100644
index 000000000..a05168b70
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
@@ -0,0 +1,294 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE (13*SHA512_BLOCK_SIZE)
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA512_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+
+ printf("multibinary_sha512_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha512 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
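+	/* (added note) the loop above drives the HASH_FIRST/HASH_UPDATE/HASH_LAST
+	 * protocol: a non-NULL, incomplete return is whichever job the manager
+	 * wants more data for (identified via user_data), so i jumps to that job
+	 * and only advances once a submission is absorbed or finished. */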
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8lX <=> %8lX",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Run sha512_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA512_BLOCK_SIZE +
+ SHA512_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8lX <=> %8lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
new file mode 100644
index 000000000..747de43bb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
@@ -0,0 +1,264 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha512_mb.h"
+
+typedef uint64_t DigestSHA512[SHA512_DIGEST_NWORDS];
+
+#define MSGS 8
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
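+/* (added note) maps a job index deterministically onto one of the MSGS
+ * known-answer vectors, so results can be checked in any completion order. */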
+
+static uint8_t msg1[] = "The quick brown fox jumps over the lazy dog";
+static uint8_t msg2[] = "The quick brown fox jumps over the lazy dog.";
+static uint8_t msg3[] = { 0x0a, 0x55, 0xdb, 0 };
+static uint8_t msg4[] = { 0xba, 0xd7, 0xc6, 0x18, 0xf4, 0x5b, 0xe2, 0x07, 0x97, 0x5e, 0 };
+
+static uint8_t msg5[] = {
+ 0xb1, 0x71, 0x5f, 0x78, 0x2f, 0xf0, 0x2c, 0x6b, 0x88, 0x93,
+ 0x7f, 0x05, 0x41, 0x16, 0
+};
+
+static uint8_t msg6[] = {
+ 0xc6, 0xa1, 0x70, 0x93, 0x65, 0x68, 0x65, 0x10, 0x20, 0xed,
+ 0xfe, 0x15, 0xdf, 0x80, 0x12, 0xac, 0xda, 0x8d, 0
+};
+
+static uint8_t msg7[] = {
+ 0xa8, 0xa3, 0x7d, 0xfc, 0x08, 0x3a, 0xd2, 0xf4, 0x7f, 0xff,
+ 0x46, 0x87, 0x38, 0xbf, 0x8b, 0x72, 0x8e, 0xb7, 0xf1, 0x90,
+ 0x7e, 0x42, 0x7f, 0xa1, 0x5c, 0xb4, 0x42, 0x4b, 0xc6, 0x85,
+ 0xe5, 0x5e, 0xd7, 0xb2, 0x82, 0x5c, 0x9c, 0x60, 0xb8, 0x39,
+ 0xcc, 0xc2, 0xfe, 0x5f, 0xb3, 0x3e, 0x36, 0xf5, 0x70, 0xcb,
+ 0x86, 0x61, 0x60, 0x9e, 0x63, 0x0b, 0xda, 0x05, 0xee, 0x64,
+ 0x1d, 0x93, 0x84, 0x28, 0x86, 0x7d, 0x90, 0xe0, 0x07, 0x44,
+ 0xa4, 0xaa, 0xd4, 0x94, 0xc9, 0x3c, 0x5f, 0x6d, 0x13, 0x27,
+ 0x87, 0x80, 0x78, 0x59, 0x0c, 0xdc, 0xe1, 0xe6, 0x47, 0xc9,
+ 0x82, 0x08, 0x18, 0xf4, 0x67, 0x64, 0x1f, 0xcd, 0x50, 0x8e,
+ 0x2f, 0x2e, 0xbf, 0xd0, 0xff, 0x3d, 0x4f, 0x27, 0x23, 0x93,
+ 0x47, 0x8f, 0x3b, 0x9e, 0x6f, 0x80, 0x6b, 0x43, 0
+};
+
+static uint8_t msg8[] = "";
+
+static DigestSHA512 expResultDigest1 = {
+ 0x07e547d9586f6a73, 0xf73fbac0435ed769, 0x51218fb7d0c8d788, 0xa309d785436bbb64,
+ 0x2e93a252a954f239, 0x12547d1e8a3b5ed6, 0xe1bfd7097821233f, 0xa0538f3db854fee6
+};
+
+static DigestSHA512 expResultDigest2 = {
+ 0x91ea1245f20d46ae, 0x9a037a989f54f1f7, 0x90f0a47607eeb8a1, 0x4d12890cea77a1bb,
+ 0xc6c7ed9cf205e67b, 0x7f2b8fd4c7dfd3a7, 0xa8617e45f3c463d4, 0x81c7e586c39ac1ed
+};
+
+static DigestSHA512 expResultDigest3 = {
+ 0x7952585e5330cb24, 0x7d72bae696fc8a6b, 0x0f7d0804577e347d, 0x99bc1b11e52f3849,
+ 0x85a428449382306a, 0x89261ae143c2f3fb, 0x613804ab20b42dc0, 0x97e5bf4a96ef919b
+};
+
+static DigestSHA512 expResultDigest4 = {
+ 0x5886828959d1f822, 0x54068be0bd14b6a8, 0x8f59f534061fb203, 0x76a0541052dd3635,
+ 0xedf3c6f0ca3d0877, 0x5e13525df9333a21, 0x13c0b2af76515887, 0x529910b6c793c8a5
+};
+
+static DigestSHA512 expResultDigest5 = {
+ 0xee1a56ee78182ec4, 0x1d2c3ab33d4c4187, 0x1d437c5c1ca060ee, 0x9e219cb83689b4e5,
+ 0xa4174dfdab5d1d10, 0x96a31a7c8d3abda7, 0x5c1b5e6da97e1814, 0x901c505b0bc07f25
+};
+
+static DigestSHA512 expResultDigest6 = {
+ 0xc36c100cdb6c8c45, 0xb072f18256d63a66, 0xc9843acb4d07de62, 0xe0600711d4fbe64c,
+ 0x8cf314ec3457c903, 0x08147cb7ac7e4d07, 0x3ba10f0ced78ea72, 0x4a474b32dae71231
+};
+
+static DigestSHA512 expResultDigest7 = {
+ 0x8e1c91729be8eb40, 0x226f6c58a029380e, 0xf7edb9dc166a5c3c, 0xdbcefe90bd30d85c,
+ 0xb7c4b248e66abf0a, 0x3a4c842281299bef, 0x6db88858d9e5ab52, 0x44f70b7969e1c072
+};
+
+static DigestSHA512 expResultDigest8 = {
+	0xcf83e1357eefb8bd, 0xf1542850d66d8007, 0xd620e4050b5715dc, 0x83f4a921d36ce9ce,
+	0x47d0d13c5d85f2b0, 0xff8318d2877eec2f, 0x63b931bd47417a81, 0xa538327af927da3e
+};
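+
+/* (added note) msg8 is the empty string; expResultDigest8 is the well-known
+ * SHA-512 digest of "" (cf83e1357eefb8bd...). */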
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8 };
+
+static uint64_t *expResultDigest[MSGS] = { expResultDigest1, expResultDigest2,
+ expResultDigest3, expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8
+};
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint64_t *good;
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the "
+ "submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha512 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..8af563068
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 10
+# define TEST_TYPE_STR "_cold"
+#endif
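+
+/* (added note) the "cold" variant sizes total data past the last-level cache
+ * so each pass streams from memory; the "warm" variant re-hashes cached
+ * buffers in a tight loop. */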
+
+#define TEST_MEM (TEST_LEN * TEST_BUFS * TEST_LOOPS)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+inline uint64_t byteswap64(uint64_t x)
+{
+#if defined (__ICC)
+ return _bswap64(x);
+#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+ return __builtin_bswap64(x);
+#else
+ return (((x & (0xffull << 0)) << 56)
+ | ((x & (0xffull << 8)) << 40)
+ | ((x & (0xffull << 16)) << 24)
+ | ((x & (0xffull << 24)) << 8)
+ | ((x & (0xffull << 32)) >> 8)
+ | ((x & (0xffull << 40)) >> 24)
+ | ((x & (0xffull << 48)) >> 40)
+ | ((x & (0xffull << 56)) >> 56));
+#endif
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha512_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha512" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha512 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("multibinary_sha512_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
new file mode 100644
index 000000000..d1167dd49
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
@@ -0,0 +1,438 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute SHA512 by-2 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define VMOVPD vmovupd
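+; (added note) message lanes carry no alignment guarantee, so input loads go
+; through the unaligned vmovupd (aliased as VMOVPD above); digest rows in
+; STATE are aligned and use vmovdqa.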
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1
+ vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0
+%endm
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
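+; (added note) NASM selects macros by parameter count, so the short PRORQ and
+; PRORQ_nd wrappers above coexist with the full forms and simply default the
+; scratch register to TMP.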
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA]
+ vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA]
+ vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
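+;; (added note) ROUND_16_XX computes the SHA-512 message schedule in place:
+;;   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
+;; using the 16-entry circular _DATA area on the stack, then runs the
+;; corresponding round via ROUND_00_15.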
+
+;; void sha512_mb_x2_avx(SHA512_MB_ARGS_X4 *args, uint64_t msg_size_in_blocks)
+;; arg 1 : STATE : pointer to args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+global sha512_mb_x2_avx:function internal
+align 32
+sha512_mb_x2_avx:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ2], a
+ vmovdqa [rsp + _DIGEST + 1*SZ2], b
+ vmovdqa [rsp + _DIGEST + 2*SZ2], c
+ vmovdqa [rsp + _DIGEST + 3*SZ2], d
+ vmovdqa [rsp + _DIGEST + 4*SZ2], e
+ vmovdqa [rsp + _DIGEST + 5*SZ2], f
+ vmovdqa [rsp + _DIGEST + 6*SZ2], g
+ vmovdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ VMOVPD TT2,[inp1+IDX+i*16]
+
+ TRANSPOSE TT0, TT2, TT1
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+
+;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
+ add IDX, 8 * 16
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ2]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ2]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ2]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ2]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ2]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ2]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ2]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+
+align 32
+; one from sha512_rorx
+; this does the big endian to little endian conversion
+; over a quad word
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
new file mode 100644
index 000000000..f492021ae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
@@ -0,0 +1,420 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute SHA512 by-2 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, r8-r11, and the size-argument register; xmm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
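+
+;; Illustrative only: _DATA (at rsp+0) serves as the 16-entry circular W
+;; buffer that ROUND_00_15/ROUND_16_XX index as [SZ2*(i & 0xf) + rsp];
+;; _DIGEST holds the copy of a..h that is re-added after each block's rounds.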
+
+%define MOVPD movupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ movapd %%t0, %%r0 ; t0 = a1 a0
+ shufpd %%r0, %%r1, 00b ; r0 = b0 a0
+ shufpd %%t0, %%r1, 11b ; t0 = b1 a1
+%endm
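+
+;; Illustrative only: shufpd with immediate 00b selects the low qword from
+;; each source and 11b the high qword, so the pair of shuffles is a 2x2
+;; qword transpose; in {hi, lo} notation:
+;;
+;;     r0' = { r1.lo, r0.lo } = {b0, a0}
+;;     t0' = { r1.hi, t0.hi } = {b1, a1}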
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psllq %%tmp, (64-(%%imm))
+ psrlq %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
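+
+;; Illustrative only: SSE has no packed 64-bit rotate, so PRORQ emulates one
+;; per 64-bit lane with two shifts and an OR; in C-like pseudocode:
+;;
+;;     uint64_t prorq(uint64_t x, unsigned imm)   /* 0 < imm < 64 */
+;;     {
+;;         return (x >> imm) | (x << (64 - imm));
+;;     }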
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+	movdqa	a0, e		; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+	PRORQ	a0, (18-14)	; sig1: a0 = (e >> 4)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORQ a1, 41 ; sig1: a1 = (e >> 41)
+ movdqa [SZ2*(%%i&0xf) + rsp],%%T1
+ paddq %%T1,[TBL + ROUND] ; T1 = W + K
+	pxor	a0, e		; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ paddq h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORQ a2, (34-28) ; sig0: a2 = (a >> 6)
+ paddq h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORQ a1, 39 ; sig0: a1 = (a >> 39)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddq h, a0
+
+ paddq d, h
+
+	pxor	a2, a		; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddq h, a1 ; h = h + ch + W + K + maj
+ paddq h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
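+
+;; Illustrative only: the sequence above computes one FIPS 180-2 SHA-512
+;; round for both lanes at once:
+;;
+;;     T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
+;;     T2 = Sigma0(a) + Maj(a,b,c)
+;;     d = d + T1,  h = T1 + T2       (then ROTATE_ARGS renames a..h)
+;;
+;; where Sigma1(e) = ROR14(e) ^ ROR18(e) ^ ROR41(e)
+;;   and Sigma0(a) = ROR28(a) ^ ROR34(a) ^ ROR39(a).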
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ2*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ movdqa a2, a1
+ PRORQ a1, 61-19
+ pxor %%T1, a0
+ PRORQ %%T1, 1
+ pxor a1, a2
+ PRORQ a1, 19
+ psrlq a0, 7
+ pxor %%T1, a0
+ psrlq a2, 6
+ pxor a1, a2
+ paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp]
+ paddq a1, [SZ2*((%%i-7)&0xf) + rsp]
+ paddq %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
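+
+;; Illustrative only: ROUND_16_XX extends the message schedule for rounds
+;; 16..79 using the stack as a 16-entry circular buffer (hence the "& 0xf"):
+;;
+;;     W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+;;
+;; where sigma0(x) = ROR1(x) ^ ROR8(x) ^ SHR7(x)
+;;   and sigma1(x) = ROR19(x) ^ ROR61(x) ^ SHR6(x).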
+
+;; void sha512_mb_x2_sse(SHA512_MB_ARGS_X4 *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer to args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+global sha512_mb_x2_sse:function internal
+align 32
+sha512_mb_x2_sse:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ2], a
+ movdqa [rsp + _DIGEST + 1*SZ2], b
+ movdqa [rsp + _DIGEST + 2*SZ2], c
+ movdqa [rsp + _DIGEST + 3*SZ2], d
+ movdqa [rsp + _DIGEST + 4*SZ2], e
+ movdqa [rsp + _DIGEST + 5*SZ2], f
+ movdqa [rsp + _DIGEST + 6*SZ2], g
+ movdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ MOVPD TT2,[inp1+IDX+i*16]
+ TRANSPOSE TT0, TT2, TT1
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+ add IDX, 8 * 16 ;; increment by a message block
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddq a, [rsp + _DIGEST + 0*SZ2]
+ paddq b, [rsp + _DIGEST + 1*SZ2]
+ paddq c, [rsp + _DIGEST + 2*SZ2]
+ paddq d, [rsp + _DIGEST + 3*SZ2]
+ paddq e, [rsp + _DIGEST + 4*SZ2]
+ paddq f, [rsp + _DIGEST + 5*SZ2]
+ paddq g, [rsp + _DIGEST + 6*SZ2]
+ paddq h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a
+ movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b
+ movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c
+ movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d
+ movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e
+ movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f
+ movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g
+ movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+ ret
+
+section .data
+align 64
+global K512_2_MB:data internal
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
new file mode 100644
index 000000000..6931bedc1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
@@ -0,0 +1,483 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute quad SHA512 using AVX2
+;; use YMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rbx, r8-r12, and the size-argument register; ymm0-15
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers:  rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
+;;
+;; Linux clobbers:    rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves:   rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers ymm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows register definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r8
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ4
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore stack_frame_size mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ4
+ .digest resb NUM_SHA512_DIGEST_WORDS*SZ4
+ .align resb 24
+endstruc
+
+%define _DIGEST stack_frame.digest
+
+%define VMOVPD vmovupd
+
+; operates on YMMs
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ ; vshufps does not cross the mid-way boundary and hence is cheaper
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+
+ vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+
+ vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+
+ ; now ok to clobber t0
+ vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+
+%endmacro
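+
+;; Illustrative only: vshufps cannot move data across the two 128-bit halves
+;; of a YMM register, so the transpose runs in two stages: in-lane qword
+;; pairing with vshufps, then whole 128-bit lane exchange with vperm2f128:
+;;
+;;     vperm2f128 dst, a, b, 0x20   ->   dst = { b.lo128 : a.lo128 }
+;;     vperm2f128 dst, a, b, 0x31   ->   dst = { b.hi128 : a.hi128 }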
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
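+
+;; Illustrative only: PRORQ_nd is the non-destructive variant -- the AVX
+;; three-operand encoding lets the rotate land in a separate destination,
+;; roughly  reg = (src >> imm) | (src << (64 - imm)) , saving the movdqa
+;; register copy that the two-operand SSE version needs.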
+
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha512_mb_x4_avx2(SHA512_MB_ARGS_X4 *STATE, const int INP_SIZE)
+;; arg 1 : STATE : pointer to args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+global sha512_mb_x4_avx2:function internal
+align 32
+sha512_mb_x4_avx2:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, stack_frame_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE]
+
+
+ lea TBL,[K512_4_MB]
+
+ ;; load the address of each of the MAX_LANES (4) message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT2,[inp0+IDX+i*32]
+ VMOVPD TT1,[inp1+IDX+i*32]
+ VMOVPD TT4,[inp2+IDX+i*32]
+ VMOVPD TT3,[inp3+IDX+i*32]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+;; Increment IDX by message block size == 4 (loop) * 32 (YMM width in bytes)
+ add IDX, 4 * 32
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a
+ vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b
+ vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c
+ vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d
+ vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e
+ vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f
+ vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g
+ vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h
+
+ ;; update input data pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, stack_frame_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+align 64
+K512_4_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
new file mode 100644
index 000000000..cc8d85122
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
@@ -0,0 +1,639 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+default rel
+;; code to compute quad SHA512 using AVX512
+;; use ZMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx/rdi, rsi/rdx, r8-r15; zmm0-31
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers:  rax rdx rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rbx rcx rsi rbp
+;;
+;; Linux clobbers:    rax rcx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves:   rbx rdx rdi rbp
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi ; usable
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var2 rdx ; arg2
+ %define var1 rcx ; arg3 usable
+ %define local_func_decl(func_name) global func_name:function internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL r8
+
+%define VMOVDQ32 vmovdqu32
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*8
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%define NUM_LANES 8
+%define SZ 8
+%define SZ8 8 * SZ
+%define DIGEST_SZ 8 * SZ8
+%define DIGEST_SAVE NUM_LANES * DIGEST_SZ
+%define RSP_SAVE 1*8
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
+FIELD _RSP, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; from FIPS 180-2 (SHA-512)
+; define rotates for Sigma function for main loop steps
+%define BIG_SIGMA_0_0 28 ; Sigma0
+%define BIG_SIGMA_0_1 34
+%define BIG_SIGMA_0_2 39
+%define BIG_SIGMA_1_0 14 ; Sigma1
+%define BIG_SIGMA_1_1 18
+%define BIG_SIGMA_1_2 41
+
+; define rotates for Sigma function for scheduling steps
+
+%define SMALL_SIGMA_0_0 1 ; sigma0
+%define SMALL_SIGMA_0_1 8
+%define SMALL_SIGMA_0_2 7
+%define SMALL_SIGMA_1_0 19 ; sigma1
+%define SMALL_SIGMA_1_1 61
+%define SMALL_SIGMA_1_2 6
+
+%define SHA_MAX_ROUNDS 80
+%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
+
+%macro TRANSPOSE8 12
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+%define %%PERM_INDEX1 %11
+%define %%PERM_INDEX2 %12
+
+
+; each element is 64 bits, 8 * 64 = 512 ==> a full ZMM register, 8 qwords (one per lane)
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+
+	;; the permute index vectors below will not get clobbered
+ vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp
+ vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp
+
+ ; process top half (r0..r3) {a...d}
+ vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0}
+ vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1}
+ vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0}
+ vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1}
+
+ vmovdqa32 %%r1, %%t0 ; r1 and r3 free
+ vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0}
+ vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2}
+
+ vmovdqa32 %%t1, %%r0 ; t1 and r3 free
+ vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+
+	;; Likewise for the bottom half (r4..r7) {e...h}	; r2 and r3 free
+ vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0}
+ vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1}
+ vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0}
+ vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1}
+
+ vmovdqa32 %%r5, %%r2 ; r5 and r7 free
+ vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0}
+ vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2}
+
+ vmovdqa32 %%r7, %%r4
+ vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+
+;;; free r3, r6
+ vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+
+;;; t0 and r3 free
+ vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+
+ vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+
+ ;; will re-order input to avoid move
+ ;vmovdqa32 %%r7, %%t0
+
+ ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+ ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+ ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+ ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+	; (row 7 is left in t0, the temp register; the caller re-orders inputs to avoid a move)
+	; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
+%endmacro
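+
+;; Illustrative only: the 8x8 qword transpose proceeds in three stages:
+;; vshufpd interleaves qword pairs within 128-bit lanes, vpermt2q (driven by
+;; the two precomputed index vectors) merges matching qwords from two source
+;; ZMMs, and vshuff64x2 swaps 128-bit blocks to finish each row:
+;;
+;;     vshuff64x2 dst, a, b, 0x44   ->   dst = { b.lo256 : a.lo256 }
+;;     vshuff64x2 dst, a, b, 0xEE   ->   dst = { b.hi256 : a.hi256 }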
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+
+
+;; CH(E, F, G)  = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
+;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
+;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
+;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
+
+;; Main processing loop per round
+;; equivalent to %macro ROUND_00_15 2
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddq T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ ;; compute BIG_SIGMA_1(E)
+ vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E)
+ vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E)
+ vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E)
+ vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E)
+ vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddq T1, T1, %%WT ; T1 = T1 + Wt
+ vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E)
+ vpaddq D, D, T1 ; D = D + T1
+ vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A)
+ vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A)
+ vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A)
+ vmovdqa32 TMP0, A
+ vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A)
+ vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C)
+ vpaddq H, H, T1 ; H(A) = H(T2) + T1
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
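+
+;; Illustrative only: vpternlogq evaluates any three-input boolean function
+;; bitwise; the immediate is the truth table indexed by (dst,src2,src3):
+;;
+;;     0x96 = XOR3: a ^ b ^ c              (combining the three rotates)
+;;     0xCA = CH:   (e & f) | (~e & g)     with (dst,src2,src3) = (E,F,G)
+;;     0xE8 = MAJ:  (a&b) | (a&c) | (b&c)
+;;
+;; so CH, MAJ, and each Sigma combine cost a single instruction here.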
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2)
+ vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2)
+ vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2)
+ vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7
+
+ vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15)
+ vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15)
+ vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) +
+						;      Wt-7 + sigma_0(Wt-15)
+
+%endmacro
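+
+;; Illustrative only: this is the same W[t] = sigma1(W[t-2]) + W[t-7] +
+;; sigma0(W[t-15]) + W[t-16] recurrence as in the SSE/AVX2 versions, but the
+;; sixteen live W values stay resident in zmm16-31 as a circular buffer
+;; instead of spilling to the stack, and AVX512's native vprorq replaces
+;; the shift/shift/or rotate emulation.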
+
+align 64
+
+; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8 *args, uint32_t size)
+; arg 1 : pointer to args
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha512_mb_x8_avx512)
+sha512_mb_x8_avx512:
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _RSP], rax
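+	;; (the usual realignment idiom: reserve STACK_SPACE, round rsp down
+	;; to a 64-byte boundary for the aligned vmovdqa32 saves, and stash
+	;; the original rsp so the epilogue restores it with a single load)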
+ lea TBL,[TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*8*8]
+ vmovups B, [DIGEST + 1*8*8]
+ vmovups C, [DIGEST + 2*8*8]
+ vmovups D, [DIGEST + 3*8*8]
+ vmovups E, [DIGEST + 4*8*8]
+ vmovups F, [DIGEST + 5*8*8]
+ vmovups G, [DIGEST + 6*8*8]
+ vmovups H, [DIGEST + 7*8*8]
+
+ xor IDX, IDX
+	;; Read in input data addresses, saving them in registers because
+	;; they serve as cursors that we keep incrementing
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+lloop:
+
+ ;; first half of 1024 (need to transpose before use)
+ vmovups W0,[inp0 + IDX ]
+ vmovups W1,[inp1 + IDX ]
+ vmovups W2,[inp2 + IDX ]
+ vmovups W3,[inp3 + IDX ]
+ vmovups W4,[inp4 + IDX ]
+ vmovups W5,[inp5 + IDX ]
+ vmovups W6,[inp6 + IDX ]
+ vmovups TMP0,[inp7 + IDX ]
+ TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3
+ ;; second half of 1024 (need to transpose before use)
+ vmovups W8,[inp0 + SZ8 + IDX ]
+ vmovups W9,[inp1 + SZ8 + IDX ]
+ vmovups W10,[inp2 + SZ8 + IDX ]
+ vmovups W11,[inp3 + SZ8 + IDX ]
+ vmovups W12,[inp4 + SZ8 + IDX ]
+ vmovups W13,[inp5 + SZ8 + IDX ]
+ vmovups W14,[inp6 + SZ8 + IDX ]
+ vmovups TMP0,[inp7 + SZ8 + IDX ]
+ TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3
+
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 128 ; increment by message block length in bytes
+
+%assign I 0
+%rep 16
+;;; little endian to big endian
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first (max-rounds -16)
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep SHA_ROUNDS_LESS_16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+;; update input data pointers
+%assign I 0
+%rep 4
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ VMOVDQ32 [DIGEST + 0*8*8], A
+ VMOVDQ32 [DIGEST + 1*8*8], B
+ VMOVDQ32 [DIGEST + 2*8*8], C
+ VMOVDQ32 [DIGEST + 3*8*8], D
+ VMOVDQ32 [DIGEST + 4*8*8], E
+ VMOVDQ32 [DIGEST + 5*8*8], F
+ VMOVDQ32 [DIGEST + 6*8*8], G
+ VMOVDQ32 [DIGEST + 7*8*8], H
+
+ mov rsp, [rsp + _RSP]
+ ret
+
+section .data
+align 64
+; 80 constants for SHA512
+; replicated for each lane, thus 8*80 qwords
+; to aid SIMD: a space-for-time tradeoff
+; local to this asm file, used nowhere else
+TABLE:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 64
+; This mask does the big-endian to little-endian conversion on each quadword of a ZMM register.
+;; Note: a byte shuffle on ZMM operates independently on four 128-bit (XMM-size) chunks.
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+ dq 0x2021222324252627, 0x28292a2b2c2d2e2f
+ dq 0x3031323334353637, 0x38393a3b3c3d3e3f
+
+align 64
+TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_x8_avx512
+no_sha512_mb_x8_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
new file mode 100644
index 000000000..e1186f8a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+;;;;;
+; mbin_dispatch_init_avoton parameters
+; Use this macro when SSE/00/01 is the minimum requirement.
+; If the CPU is an Avoton, avoton_func is used instead of sse_func.
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+; 5-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init_avoton 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ lea mbin_rdi, [%5 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6_avoton parameters
+; If the CPU is an Avoton, avoton_func is used instead of sse_func.
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init6_avoton 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ lea mbin_rdi, [%7 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetvb.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_ECX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_ECX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
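
The two macros above implement lazy, first-call dispatch: each public symbol initially points at its *_dispatch_init routine, which probes CPUID (and XGETBV) once, overwrites the *_dispatched pointer with the best implementation, and never runs again. A minimal C sketch of the same pattern, for illustration only: the function names are hypothetical, it assumes GCC/clang's <cpuid.h>, and it omits the OSXSAVE/XGETBV and Avoton-stepping checks the macros perform.

    #include <cpuid.h>

    typedef void (*sha512_init_fn)(void *state);

    static void sha512_init_sse(void *state)  { (void)state; /* SSE path */ }
    static void sha512_init_avx2(void *state) { (void)state; /* AVX2 path */ }

    static sha512_init_fn sha512_init_dispatched;  /* NULL until first call */

    static void sha512_init_dispatch(void *state)
    {
            unsigned int a, b, c, d;
            sha512_init_fn fn = sha512_init_sse;

            /* CPUID leaf 7, subleaf 0: EBX advertises AVX2 */
            if (__get_cpuid_count(7, 0, &a, &b, &c, &d) && (b & bit_AVX2))
                    fn = sha512_init_avx2;

            sha512_init_dispatched = fn;  /* subsequent calls skip the probe */
            fn(state);
    }

    void sha512_init(void *state)
    {
            (sha512_init_dispatched ? sha512_init_dispatched
                                    : sha512_init_dispatch)(state);
    }
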
+
+default rel
+[bits 64]
+
+%define def_wrd dq
+%define wrd_sz qword
+%define arg1 rsi
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha512_ctx_mgr_init_sse
+extern sha512_ctx_mgr_submit_sse
+extern sha512_ctx_mgr_flush_sse
+
+extern sha512_ctx_mgr_init_avx
+extern sha512_ctx_mgr_submit_avx
+extern sha512_ctx_mgr_flush_avx
+
+extern sha512_ctx_mgr_init_avx2
+extern sha512_ctx_mgr_submit_avx2
+extern sha512_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha512_ctx_mgr_init_avx512
+ extern sha512_ctx_mgr_submit_avx512
+ extern sha512_ctx_mgr_flush_avx512
+%endif
+
+extern sha512_ctx_mgr_init_sb_sse4
+extern sha512_ctx_mgr_submit_sb_sse4
+extern sha512_ctx_mgr_flush_sb_sse4
+
+;;; *_mbinit are the initial values of the *_dispatched pointers; each pointer
+;;; is updated on the first call, so *_dispatch_init is only executed once.
+
+; Initialise symbols
+mbin_interface sha512_ctx_mgr_init
+mbin_interface sha512_ctx_mgr_submit
+mbin_interface sha512_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6, substituting the SSE version for the base function
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \
+ sha512_ctx_mgr_init_sse, sha512_ctx_mgr_init_avx, \
+ sha512_ctx_mgr_init_avx2, sha512_ctx_mgr_init_avx512, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \
+ sha512_ctx_mgr_submit_sse, sha512_ctx_mgr_submit_avx, \
+ sha512_ctx_mgr_submit_avx2, sha512_ctx_mgr_submit_avx512, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \
+ sha512_ctx_mgr_flush_sse, sha512_ctx_mgr_flush_avx, \
+ sha512_ctx_mgr_flush_avx2, sha512_ctx_mgr_flush_avx512, \
+ sha512_ctx_mgr_flush_sb_sse4
+%else
+ mbin_dispatch_init_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \
+ sha512_ctx_mgr_init_avx, sha512_ctx_mgr_init_avx2, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \
+ sha512_ctx_mgr_submit_avx, sha512_ctx_mgr_submit_avx2, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \
+ sha512_ctx_mgr_flush_avx, sha512_ctx_mgr_flush_avx2, \
+ sha512_ctx_mgr_flush_sb_sse4
+%endif
+
+
+;;; func core, ver, snum
+slversion sha512_ctx_mgr_init, 00, 03, 0175
+slversion sha512_ctx_mgr_submit, 00, 03, 0176
+slversion sha512_ctx_mgr_flush, 00, 03, 0177
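
Once dispatched, the three interfaces are used like any other ISA-L multi-buffer hash. A minimal single-job sketch, assuming the SHA512_HASH_CTX types, the HASH_ENTIRE flag, and the hash_ctx_* helpers declared in sha512_mb.h elsewhere in this tree; sha512_one_shot is a hypothetical helper name and error handling is elided.

    #include <stdint.h>
    #include "sha512_mb.h"

    /* Hash one buffer through the dispatched multi-buffer interface. */
    int sha512_one_shot(SHA512_HASH_CTX_MGR *mgr, const void *buf, uint32_t len,
                        uint64_t digest_out[SHA512_DIGEST_NWORDS])
    {
            SHA512_HASH_CTX ctx;
            int i;

            hash_ctx_init(&ctx);
            sha512_ctx_mgr_init(mgr);
            sha512_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
            while (sha512_ctx_mgr_flush(mgr) != NULL)
                    ;       /* drain the lanes until all jobs retire */

            if (!hash_ctx_complete(&ctx))
                    return -1;
            for (i = 0; i < SHA512_DIGEST_NWORDS; i++)
                    digest_out[i] = ctx.job.result_digest[i];
            return 0;
    }
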
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
new file mode 100644
index 000000000..bb9a8f5e8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha512_mb.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA512 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define H0 0x6a09e667f3bcc908
+#define H1 0xbb67ae8584caa73b
+#define H2 0x3c6ef372fe94f82b
+#define H3 0xa54ff53a5f1d36f1
+#define H4 0x510e527fade682d1
+#define H5 0x9b05688c2b3e6c1f
+#define H6 0x1f83d9abfb41bd6b
+#define H7 0x5be0cd19137e2179
+
+void sha512_single(const uint8_t * data, uint64_t digest[8]);
+
+void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA512_BLOCK_SIZE];
+
+	/* A 128-bit length is not needed since len is a uint32_t; use a 64-bit
+	 * length and leave the upper 64 bits of the length field zero. */
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ /* Hash the complete blocks */
+ while (i >= SHA512_BLOCK_SIZE) {
+ sha512_single(input_data, digest);
+ input_data += SHA512_BLOCK_SIZE;
+ i -= SHA512_BLOCK_SIZE;
+ }
+
+ /* Copy remainder to a buffer to be padded */
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+
+ // Pad more than required here and overwrite with length
+ for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA512_BLOCK_SIZE;
+ else
+ i = SHA512_BLOCK_SIZE;
+
+	convert.uint = 8 * (uint64_t) len;	/* widen before multiplying to avoid 32-bit overflow */
+ p = buf + i - 8;
+ p[0] = convert.uchar[7];
+ p[1] = convert.uchar[6];
+ p[2] = convert.uchar[5];
+ p[3] = convert.uchar[4];
+ p[4] = convert.uchar[3];
+ p[5] = convert.uchar[2];
+ p[6] = convert.uchar[1];
+ p[7] = convert.uchar[0];
+
+ /* Hash the padded last block */
+ sha512_single(buf, digest);
+	if (i == 2 * SHA512_BLOCK_SIZE)
+		sha512_single(buf + SHA512_BLOCK_SIZE, digest);
+}
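
This padding rule means the 0x80 byte and the 16-byte length field fit in the final block only when at most 111 residual bytes remain; otherwise a second block is hashed. A minimal sketch of the resulting block count, assuming the same SHA512_BLOCK_SIZE (128) and SHA512_PADLENGTHFIELD_SIZE (16) values; the helper name is illustrative only:

    #include <stdint.h>

    /* Blocks hashed by sha512_ref() for a len-byte message: the final
     * block absorbs 0x80 plus the 16-byte length field only when the
     * residual length is <= 111 bytes; otherwise two blocks are used. */
    static uint32_t sha512_padded_blocks(uint32_t len)
    {
            uint32_t residual = len % 128;

            return len / 128 + (residual + 1 + 16 <= 128 ? 1 : 2);
    }

For example, a 111-byte message is hashed as one padded block, while a 112-byte message needs two.
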
+
+/* From FIPS 180-4: these have the same form as the SHA256 functions, but
+ * operate on 64-bit words instead of 32-bit words.
+ */
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+
+/* The sigma functions have the same form as in SHA256, but
+ * - the word size is 64 bits
+ * - the rotate amounts differ
+ */
+#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r))))
+
+/* Note the naming is inverted relative to the spec: s0/s1 below are the
+ * "capital sigma" functions and S0/S1 the "small sigma" functions; keep
+ * as-is for consistency with the other reference implementations.
+ */
+#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39))
+#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41))
+
+#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7))
+#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6))
+
+#define bswap(x) (((x) & (0xffull << 0)) << 56) \
+ | (((x) & (0xffull << 8)) << 40) \
+ | (((x) & (0xffull <<16)) << 24) \
+ | (((x) & (0xffull <<24)) << 8) \
+ | (((x) & (0xffull <<32)) >> 8) \
+ | (((x) & (0xffull <<40)) >> 24) \
+ | (((x) & (0xffull <<48)) >> 40) \
+ | (((x) & (0xffull <<56)) >> 56)
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = bswap(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha512_single(const uint8_t * data, uint64_t digest[8])
+{
+	/* Working variables; these must all be uint64_t */
+ uint64_t a, b, c, d, e, f, g, h, t1, t2;
+ uint64_t w[16];
+	const uint64_t *ww = (const uint64_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
+ step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
+ step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
+ step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
+ step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63
+ step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
+ step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
+ step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
+ step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
+ step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
+ step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
+ step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
+ step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
+ step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
+ step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
+ step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
+ step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
+ step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
+ step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
+ step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
+ step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
new file mode 100644
index 000000000..6eeed19fd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_flush_sse4.
+ *
+ * Description: Dummy API. The single-buffer submit path processes each job
+ * synchronously, so there is never anything queued to flush.
+ *
+ * Return: always NULL.
+ */
+SHA512_JOB *sha512_sb_mgr_flush_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return NULL;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
new file mode 100644
index 000000000..93ce88dfd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
@@ -0,0 +1,38 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+// For the single-buffer API there is nothing to initialize here.
+// This function exists only so the single-buffer code complies with
+// the multi-buffer API.
+void sha512_sb_mgr_init_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
new file mode 100644
index 000000000..659d14339
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_submit_sse4
+ *
+ * Description: Wrapper API for the update routine of single-buffer SHA512,
+ *              to comply with the multi-buffer API.
+ *
+ *              This function picks up the message/digest and length
+ *              information from the argument "job", then calls into
+ *              sha512_sse4(). The argument "state" is passed in but not
+ *              actually used here.
+ *
+ * Note: message init and padding are done outside. This function
+ *       expects a packed (fully padded) buffer.
+ *
+ * Argument: state - not actually used.
+ *           job - contains the message, digest, message length, etc.
+ *
+ * Return: SHA512_JOB pointer.
+ *
+ **/
+SHA512_JOB *sha512_sb_mgr_submit_sse4(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+ assert(job != NULL);
+
+ uint8_t *buff = job->buffer;
+ uint64_t *digest = job->result_digest, len = job->len;
+
+ sha512_sse4((const void *)buff, (void *)digest, len);
+
+ return job;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
new file mode 100644
index 000000000..57598a0e2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
@@ -0,0 +1,394 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+default rel
+BITS 64
+section .text
+
+; Virtual Registers
+%ifidn __OUTPUT_FORMAT__, win64
+ %define msg rcx ; ARG1
+ %define digest rdx ; ARG2
+ %define msglen r8 ; ARG3
+ %define T1 rsi
+ %define T2 rdi
+%else
+ %define msg rdi ; ARG1
+ %define digest rsi ; ARG2
+ %define msglen rdx ; ARG3
+ %define T1 rcx
+ %define T2 r8
+%endif
+%define a_64 r9
+%define b_64 r10
+%define c_64 r11
+%define d_64 r12
+%define e_64 r13
+%define f_64 r14
+%define g_64 r15
+%define h_64 rbx
+%define tmp0 rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM-align RSP,
+; since the CALL pushed a return address and left RSP at 8 mod 16
+struc frame
+ .W: resq 80 ; Message Schedule
+ .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ .GPRSAVE: resq 7
+%else
+ .GPRSAVE: resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i) msg + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i) K512 + 8*(i) ; SHA Constants (static mem)
+%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd or even
+
+%macro RotateState 0
+	; Rotate symbols a..h right
+ %xdefine %%TMP h_64
+ %xdefine h_64 g_64
+ %xdefine g_64 f_64
+ %xdefine f_64 e_64
+ %xdefine e_64 d_64
+ %xdefine d_64 c_64
+ %xdefine c_64 b_64
+ %xdefine b_64 a_64
+ %xdefine a_64 %%TMP
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+ ; Compute Round %%t
+ mov T1, f_64 ; T1 = f
+ mov tmp0, e_64 ; tmp = e
+ xor T1, g_64 ; T1 = f ^ g
+ ror tmp0, 23 ; 41 ; tmp = e ror 23
+ and T1, e_64 ; T1 = (f ^ g) & e
+ xor tmp0, e_64 ; tmp = (e ror 23) ^ e
+ xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+ ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
+ xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov T2, a_64 ; T2 = a
+ add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov tmp0, a_64 ; tmp = a
+ xor T2, c_64 ; T2 = a ^ c
+ and tmp0, c_64 ; tmp = a & c
+ and T2, b_64 ; T2 = (a ^ c) & b
+ xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov tmp0, a_64 ; tmp = a
+ ror tmp0, 5 ; 39 ; tmp = a ror 5
+ xor tmp0, a_64 ; tmp = (a ror 5) ^ a
+ add d_64, T1 ; e(next_state) = d + T1
+ ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
+ xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
+ ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+%endmacro
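
The interleaved ror/xor chain above evaluates S1(e) and S0(a) with a single temporary register: each rotate distributes over the accumulated xor, so the running rotate totals noted in the comments (41/18/14 and 39/34/28) are reached at the final ror. A small C check of the identity, for illustration only:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t ror64(uint64_t x, int r)
    {
            return (x >> r) | (x << (64 - r));
    }

    static void check_sigma_chains(uint64_t v)
    {
            /* S1(e): ((((e ror 23) ^ e) ror 4) ^ e) ror 14 */
            assert(ror64(ror64(ror64(v, 23) ^ v, 4) ^ v, 14) ==
                   (ror64(v, 14) ^ ror64(v, 18) ^ ror64(v, 41)));
            /* S0(a): ((((a ror 5) ^ a) ror 6) ^ a) ror 28 */
            assert(ror64(ror64(ror64(v, 5) ^ v, 6) ^ v, 28) ==
                   (ror64(v, 28) ^ ror64(v, 34) ^ ror64(v, 39)));
    }
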
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+ ; Compute rounds %%t-2 and %%t-1
+ ; Compute message schedule QWORDS %%t and %%t+1
+
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ ; scheduler.
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ ; They are then added to their respective SHA512 constants at
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	; For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ ; The computation of the message schedule and the rounds are tightly
+ ; stitched to take advantage of instruction-level parallelism.
+ ; For clarity, integer instructions (for the rounds calculation) are indented
+ ; by one tab. Vectored instructions (for the message scheduler) are indented
+ ; by two tabs.
+
+ mov T1, f_64
+ movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2]
+ xor T1, g_64
+ and T1, e_64
+ movdqa xmm0, xmm2 ; XMM0 = W[t-2]
+ xor T1, g_64
+ add T1, [WK_2(%%t)]
+ movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+ mov tmp0, e_64
+ ror tmp0, 23 ; 41
+ movdqa xmm3, xmm5 ; XMM3 = W[t-15]
+ xor tmp0, e_64
+ ror tmp0, 4 ; 18
+ psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42
+ xor tmp0, e_64
+ ror tmp0, 14 ; 14
+ psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1
+ add T1, tmp0
+ add T1, h_64
+ pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov T2, a_64
+ xor T2, c_64
+ pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and T2, b_64
+ mov tmp0, a_64
+ psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and tmp0, c_64
+ xor T2, tmp0
+ psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov tmp0, a_64
+ ror tmp0, 5 ; 39
+ pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor tmp0, a_64
+ ror tmp0, 6 ; 34
+ pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor tmp0, a_64
+ ror tmp0, 28 ; 28
+ psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add T2, tmp0
+ add d_64, T1
+ psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea h_64, [T1 + T2]
+ RotateState
+ movdqa xmm1, xmm2 ; XMM1 = W[t-2]
+ mov T1, f_64
+ xor T1, g_64
+ movdqa xmm4, xmm5 ; XMM4 = W[t-15]
+ and T1, e_64
+ xor T1, g_64
+ psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
+ add T1, [WK_2(%%t+1)]
+ mov tmp0, e_64
+ psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
+ ror tmp0, 23 ; 41
+ xor tmp0, e_64
+ pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2]
+ ror tmp0, 4 ; 18
+ xor tmp0, e_64
+ pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15]
+ ror tmp0, 14 ; 14
+ add T1, tmp0
+ psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add T1, h_64
+ mov T2, a_64
+ psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor T2, c_64
+ and T2, b_64
+ pxor xmm0, xmm1 ; XMM0 = s1(W[t-2])
+ mov tmp0, a_64
+ and tmp0, c_64
+ movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+ xor T2, tmp0
+ pxor xmm3, xmm4 ; XMM3 = s0(W[t-15])
+ mov tmp0, a_64
+ paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror tmp0, 5 ; 39
+ paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor tmp0, a_64
+ paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror tmp0, 6 ; 34
+ movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords
+ xor tmp0, a_64
+		paddq	xmm0, [K_t(%%t)] ; Compute W[t]+K[t]
+ ror tmp0, 28 ; 28
+		movdqa	[WK_2(%%t)], xmm0 ; Store W[t]+K[t] for next rounds
+ add T2, tmp0
+ add d_64, T1
+ lea h_64, [T1 + T2]
+ RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+; message blocks.
+; L is the message length in SHA512 blocks.
+global sha512_sse4:function
+sha512_sse4:
+ cmp msglen, 0
+ je .nowork
+
+ ; Allocate Stack Space
+ sub rsp, frame_size
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 1], r12
+ mov [rsp + frame.GPRSAVE + 8 * 2], r13
+ mov [rsp + frame.GPRSAVE + 8 * 3], r14
+ mov [rsp + frame.GPRSAVE + 8 * 4], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + frame.GPRSAVE + 8 * 5], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+
+.updateblock:
+
+ ; Load state variables
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ %assign t 0
+ %rep 80/2 + 1
+ ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ ; +1 iteration because the scheduler leads hashing by 1 iteration
+ %if t < 2
+ ; BSWAP 2 QWORDS
+ movdqa xmm1, [XMM_QWORD_BSWAP]
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ movdqa [WK_2(t)], xmm0 ; Store into WK for rounds
+ %elif t < 16
+ ; BSWAP 2 QWORDS; Compute 2 Rounds
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ SHA512_Round t - 2 ; Round t-2
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ SHA512_Round t - 1 ; Round t-1
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
+ %elif t < 79
+ ; Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ %else
+ ; Compute 2 Rounds
+ SHA512_Round t - 2
+ SHA512_Round t - 1
+ %endif
+ %assign t t+2
+ %endrep
+
+ ; Update digest
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ ; Advance to next message block
+ add msg, 16*8
+ dec msglen
+ jnz .updateblock
+
+ ; Restore GPRs
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+ ; Restore Stack Pointer
+ add rsp, frame_size
+
+.nowork:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
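
As the submit wrapper in sha512_sb_mgr_submit_sse4.c notes, sha512_sse4() consumes whole, pre-padded blocks and a caller-initialized digest. A hypothetical standalone driver for the one-block message "abc", using the initial H0..H7 constants listed in sha512_ref.c above and the padding rule from that file; illustration only:

    #include <stdint.h>
    #include <string.h>

    void sha512_sse4(const void *M, void *D, uint64_t L);  /* L = block count */

    void sha512_abc(uint64_t digest[8])
    {
            static const uint64_t iv[8] = {
                    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
                    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
                    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
                    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
            };
            uint8_t block[128] = { 'a', 'b', 'c', 0x80 };  /* rest zero-padded */

            block[127] = 3 * 8;     /* big-endian 128-bit bit length = 24 */
            memcpy(digest, iv, sizeof(iv));
            sha512_sse4(block, digest, 1);
            /* digest[] now holds the SHA-512("abc") words in host order */
    }
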