summaryrefslogtreecommitdiffstats
path: root/src/crypto/isa-l/isa-l_crypto/md5_mb
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/md5_mb')
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c253
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm55
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm73
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm243
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm251
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm313
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm244
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c44
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c40
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm222
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm235
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm280
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm223
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c151
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c196
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c291
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c223
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c123
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm850
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm782
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm778
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm917
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c193
29 files changed, 7934 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
new file mode 100644
index 000000000..8001e4310
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
@@ -0,0 +1,83 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += md5_mb/md5_ctx_sse.c \
+ md5_mb/md5_ctx_avx.c \
+ md5_mb/md5_ctx_avx2.c
+
+lsrc += md5_mb/md5_mb_mgr_init_sse.c \
+ md5_mb/md5_mb_mgr_init_avx2.c \
+ md5_mb/md5_mb_mgr_init_avx512.c
+
+lsrc += md5_mb/md5_mb_mgr_submit_sse.asm \
+ md5_mb/md5_mb_mgr_submit_avx.asm \
+ md5_mb/md5_mb_mgr_submit_avx2.asm \
+ md5_mb/md5_mb_mgr_flush_sse.asm \
+ md5_mb/md5_mb_mgr_flush_avx.asm \
+ md5_mb/md5_mb_mgr_flush_avx2.asm \
+ md5_mb/md5_mb_x4x2_sse.asm \
+ md5_mb/md5_mb_x4x2_avx.asm \
+ md5_mb/md5_mb_x8x2_avx2.asm \
+ md5_mb/md5_multibinary.asm
+
+lsrc += md5_mb/md5_mb_mgr_submit_avx512.asm \
+ md5_mb/md5_mb_mgr_flush_avx512.asm \
+ md5_mb/md5_mb_x16x2_avx512.asm \
+ md5_mb/md5_ctx_avx512.c
+
+extern_hdrs += include/md5_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ md5_mb/md5_job.asm \
+ md5_mb/md5_mb_mgr_datastruct.asm \
+ md5_mb/md5_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/memcpy_inline.h \
+ include/intrinreg.h
+
+check_tests += md5_mb/md5_mb_test \
+ md5_mb/md5_mb_rand_test \
+ md5_mb/md5_mb_rand_update_test
+
+unit_tests += md5_mb/md5_mb_rand_ssl_test
+
+perf_tests += md5_mb/md5_mb_vs_ossl_perf
+
+
+md5_mb_rand_test: md5_ref.o
+md5_mb_md5_mb_rand_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_update_test: md5_ref.o
+md5_mb_md5_mb_rand_update_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_ssl_test: LDLIBS += -lcrypto
+md5_mb_md5_mb_rand_ssl_test_LDFLAGS = -lcrypto
+md5_mb_vs_ossl_perf: LDLIBS += -lcrypto
+md5_mb_md5_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
new file mode 100644
index 000000000..2125be63b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx_slver_02020183;
+struct slver md5_ctx_mgr_init_avx_slver = { 0x0183, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_submit_avx_slver_02020184;
+struct slver md5_ctx_mgr_submit_avx_slver = { 0x0184, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_flush_avx_slver_02020185;
+struct slver md5_ctx_mgr_flush_avx_slver = { 0x0185, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
new file mode 100644
index 000000000..71618a3c8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx2(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx2_slver_04020186;
+struct slver md5_ctx_mgr_init_avx2_slver = { 0x0186, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_submit_avx2_slver_04020187;
+struct slver md5_ctx_mgr_submit_avx2_slver = { 0x0187, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_flush_avx2_slver_04020188;
+struct slver md5_ctx_mgr_flush_avx2_slver = { 0x0188, 0x02, 0x04 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
new file mode 100644
index 000000000..a7f54c2b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx512(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx512_slver_0600018c;
+struct slver md5_ctx_mgr_init_avx512_slver = { 0x018c, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_submit_avx512_slver_0600018d;
+struct slver md5_ctx_mgr_submit_avx512_slver = { 0x018d, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_flush_avx512_slver_0600018e;
+struct slver md5_ctx_mgr_flush_avx512_slver = { 0x018e, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
new file mode 100644
index 000000000..8688dfc37
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_sse(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_sse(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_sse_slver_00020180;
+struct slver md5_ctx_mgr_init_sse_slver = { 0x0180, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_submit_sse_slver_00020181;
+struct slver md5_ctx_mgr_submit_sse_slver = { 0x0181, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_flush_sse_slver_00020182;
+struct slver md5_ctx_mgr_flush_sse_slver = { 0x0182, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
new file mode 100644
index 000000000..9f4c510c2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
@@ -0,0 +1,55 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define JOB_MD5 structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_MD5
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 4, 4 ; length in bytes
+FIELD _result_digest, 4*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _JOB_MD5_size _FIELD_OFFSET
+%assign _JOB_MD5_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..63743cef5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
@@ -0,0 +1,73 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define MD5 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MD5_ARGS_X32
+;;; name size align
+FIELD _digest, 4*4*32, 16 ; transposed digest
+FIELD _data_ptr, 8*32, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _MD5_ARGS_X8_size _FIELD_OFFSET
+%assign _MD5_ARGS_X8_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X16_size _FIELD_OFFSET
+%assign _MD5_ARGS_X16_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X32_size _FIELD_OFFSET
+%assign _MD5_ARGS_X32_align _STRUCT_ALIGN
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _MD5_ARGS_X8_size, _MD5_ARGS_X8_align
+FIELD _lens, 4*32, 8
+FIELD _unused_lanes, 8*4, 8
+FIELD _ldata, _LANE_DATA_size*32, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..7b681136c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
@@ -0,0 +1,243 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_avx
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobberred by md5_mb_x4x2_avx
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+global md5_mb_mgr_flush_avx:function
+md5_mb_mgr_flush_avx:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..ecc283193
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
@@ -0,0 +1,251 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobberred by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx2(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+global md5_mb_mgr_flush_avx2:function
+md5_mb_mgr_flush_avx2:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..e8d4ca03c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
@@ -0,0 +1,313 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define unused_lanes ymm7
+%define lane r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+;; Byte-shift right the 32-byte value at MEM addr by one byte.
+;; The unaligned 32-byte load from [addr+1] reads an extra byte at
+;; [addr+32]; the vacated top byte [addr+31] is then zeroed.
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr + 1]
+	vmovdqu [%%addr], %%TMP_YMM
+	mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte-shift left the 32-byte value at MEM addr by one byte.
+;; The unaligned 32-byte load from [addr-1] reads an extra byte at
+;; [addr-1]; the vacated bottom byte [addr] is then zeroed.
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr-1]
+	vmovdqu [%%addr], %%TMP_YMM
+	mov [%%addr], byte 0
+%endmacro
+
+align 64
+default rel
+section .text
+
+; JOB* md5_mb_mgr_flush_avx512(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+;
+; Force completion of an outstanding job even though the 32 lanes are
+; not all full: pick a lane holding a real job, copy its data pointer
+; into every empty lane (so the kernel can run safely on all lanes),
+; hash until the shortest job finishes, and return that job.
+; Returns NULL (rax = 0) when no lanes are in use.
+global md5_mb_mgr_flush_avx512:function
+md5_mb_mgr_flush_avx512:
+	sub	rsp, STACK_SPACE
+	mov	[rsp + _GPR_SAVE + 8*0], rbx
+	mov	[rsp + _GPR_SAVE + 8*3], rbp
+	mov	[rsp + _GPR_SAVE + 8*4], r12
+	mov	[rsp + _GPR_SAVE + 8*5], r13
+	mov	[rsp + _GPR_SAVE + 8*6], r14
+	mov	[rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + _GPR_SAVE + 8*1], rsi
+	mov	[rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa	[rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa	[rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa	[rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa	[rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa	[rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa	[rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa	[rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa	[rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa	[rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa	[rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; nothing to flush when no lane holds a job
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp	num_lanes_inuse, 0
+	jz	return_null
+
+	; find a lane with a non-null job (lane 0 is the default)
+	xor	idx, idx
+%assign I 1
+%rep 31
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+	; copy idx to empty lanes
+copy_lane_data:
+	mov	tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 32
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args + _data_ptr + 8*I], tmp
+	mov	dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length of the first 16 lanes
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1	; ymm2 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm2, 8	; ymm3 has {x,x,D,C}
+	vpminud ymm2, ymm2, ymm3	; ymm2 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm2, 4	; ymm3 has {x,x,x,E}
+	vpminud ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+	; Find min length of the last 16 lanes
+	vmovdqu ymm5, [state + _lens + 2*32]
+	vmovdqu ymm6, [state + _lens + 3*32]
+
+	vpminud ymm4, ymm5, ymm6	; ymm4 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm4, 8	; ymm3 has {x,x,D,C}
+	vpminud ymm4, ymm4, ymm3	; ymm4 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm4, 4	; ymm3 has {x,x,x,E}
+	vpminud ymm4, ymm4, ymm3	; ymm4 has min value in low dword
+	vperm2i128 ymm3, ymm4, ymm4, 1	; ymm3 has halves of ymm4 reversed
+	vpminud ymm4, ymm4, ymm3	; ymm4 has min value in low dword
+
+	vpminud ymm2, ymm2, ymm4	; ymm2 has min value in low dword
+	; each lens entry packs (num_blocks << 6) | lane_index
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0x3F		; low 6 bits: index of the shortest lane
+	shr	len2, 6			; high bits: number of blocks to hash
+	jz	len_is_0
+
+	vpand	ymm2, ymm2, [rel clear_low_6bits]
+	vpshufd	ymm2, ymm2, 0
+
+	; subtract the min length from every lane's remaining length
+	vpsubd	ymm0, ymm0, ymm2
+	vpsubd	ymm1, ymm1, ymm2
+	vpsubd	ymm5, ymm5, ymm2
+	vpsubd	ymm6, ymm6, ymm2
+
+	vmovdqu	[state + _lens + 0*32], ymm0
+	vmovdqu	[state + _lens + 1*32], ymm1
+	vmovdqu	[state + _lens + 2*32], ymm5
+	vmovdqu	[state + _lens + 3*32], ymm6
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x16x2_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	lane, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+
+	; push lane index idx onto the byte-packed unused-lanes list:
+	; shift the list up a byte, then store idx in the lowest byte
+	shl	lane, 8
+	or	lane, idx
+	MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+	mov	[state + _unused_lanes], lane
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; digest words are stored transposed (one row of 32 lanes per
+	; word, stride 4*16*2 bytes); gather lane idx's 4 words into xmm0
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa	xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa	xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa	xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa	xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa	xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa	xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa	xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa	xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa	xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa	xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov	rsi, [rsp + _GPR_SAVE + 8*1]
+	mov	rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov	rbx, [rsp + _GPR_SAVE + 8*0]
+	mov	rbp, [rsp + _GPR_SAVE + 8*3]
+	mov	r12, [rsp + _GPR_SAVE + 8*4]
+	mov	r13, [rsp + _GPR_SAVE + 8*5]
+	mov	r14, [rsp + _GPR_SAVE + 8*6]
+	mov	r15, [rsp + _GPR_SAVE + 8*7]
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; NULL return: no job completed
+	jmp	return
+
+
+section .data align=16
+
+align 16
+; mask that clears the low 6 bits (the packed lane index) of the min
+; lens dword, leaving only the block count (already shifted left by 6)
+clear_low_6bits:
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+; lane-index constants loaded by the cmovne lane-selection chain
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+lane_16: dq 16
+lane_17: dq 17
+lane_18: dq 18
+lane_19: dq 19
+lane_20: dq 20
+lane_21: dq 21
+lane_22: dq 22
+lane_23: dq 23
+lane_24: dq 24
+lane_25: dq 25
+lane_26: dq 26
+lane_27: dq 27
+lane_28: dq 28
+lane_29: dq 29
+lane_30: dq 30
+lane_31: dq 31
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_flush_avx512
+no_md5_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..7ee81616f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
@@ -0,0 +1,244 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_sse(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+;
+; Force completion of an outstanding job even though the 8 lanes are
+; not all full: pick a lane holding a real job, copy its data pointer
+; into every empty lane, hash until the shortest job finishes, and
+; return that job.  Returns NULL when all lanes are empty.
+global md5_mb_mgr_flush_sse:function
+md5_mb_mgr_flush_sse:
+	sub	rsp, STACK_SPACE
+	mov	[rsp + _GPR_SAVE + 8*0], rbx
+	mov	[rsp + _GPR_SAVE + 8*3], rbp
+	mov	[rsp + _GPR_SAVE + 8*4], r12
+	mov	[rsp + _GPR_SAVE + 8*5], r13
+	mov	[rsp + _GPR_SAVE + 8*6], r14
+	mov	[rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + _GPR_SAVE + 8*1], rsi
+	mov	[rsp + _GPR_SAVE + 8*2], rdi
+	movdqa	[rsp + _XMM_SAVE + 16*0], xmm6
+	movdqa	[rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa	[rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa	[rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa	[rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa	[rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa	[rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa	[rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa	[rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa	[rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; if bit (32+3) is set, then all lanes are empty
+	mov	unused_lanes, [state + _unused_lanes]
+	bt	unused_lanes, 32+3
+	jc	return_null
+
+	; find a lane with a non-null job (lane 0 is the default)
+	xor	idx, idx
+	cmp	qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [one]
+	cmp	qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [two]
+	cmp	qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [three]
+	cmp	qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [four]
+	cmp	qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [five]
+	cmp	qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [six]
+	cmp	qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [seven]
+
+	; copy idx to empty lanes
+copy_lane_data:
+	mov	tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args + _data_ptr + 8*I], tmp
+	mov	dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	movdqa	xmm0, [state + _lens + 0*16]
+	movdqa	xmm1, [state + _lens + 1*16]
+
+	movdqa	xmm2, xmm0
+	pminud	xmm2, xmm1	; xmm2 has {D,C,B,A}
+	palignr	xmm3, xmm2, 8	; xmm3 has {x,x,D,C}
+	pminud	xmm2, xmm3	; xmm2 has {x,x,E,F}
+	palignr	xmm3, xmm2, 4	; xmm3 has {x,x,x,E}
+	pminud	xmm2, xmm3	; xmm2 has min value in low dword
+
+	; each lens entry packs (num_blocks << 4) | lane_index
+	movd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF	; low nibble: index of the shortest lane
+	shr	len2, 4		; high bits: number of blocks to hash
+	jz	len_is_0
+
+	pand	xmm2, [rel clear_low_nibble]
+	pshufd	xmm2, xmm2, 0
+
+	; subtract the min length from every lane's remaining length
+	psubd	xmm0, xmm2
+	psubd	xmm1, xmm2
+
+	movdqa	[state + _lens + 0*16], xmm0
+	movdqa	[state + _lens + 1*16], xmm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x4x2_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push lane idx back onto the nibble-packed free-lane stack
+	mov	unused_lanes, [state + _unused_lanes]
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; digest words are stored transposed (one row of 8 lanes per
+	; word, stride 32 bytes); gather lane idx's 4 words into xmm0
+	movd	xmm0, [state + _args_digest + 4*idx + 0*32]
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	movdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa	xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa	xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa	xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa	xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa	xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa	xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa	xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa	xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa	xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa	xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov	rsi, [rsp + _GPR_SAVE + 8*1]
+	mov	rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov	rbx, [rsp + _GPR_SAVE + 8*0]
+	mov	rbp, [rsp + _GPR_SAVE + 8*3]
+	mov	r12, [rsp + _GPR_SAVE + 8*4]
+	mov	r13, [rsp + _GPR_SAVE + 8*5]
+	mov	r14, [rsp + _GPR_SAVE + 8*6]
+	mov	r15, [rsp + _GPR_SAVE + 8*7]
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; NULL return: no job completed
+	jmp	return
+
+
+section .data align=16
+
+align 16
+; mask that clears the low nibble (the packed lane index) of the min
+; lens dword, leaving only the block count (already shifted left by 4)
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+; lane-index constants loaded by the cmovne lane-selection chain
+one:	dq  1
+two:	dq  2
+three:	dq  3
+four:	dq  4
+five:	dq  5
+six:	dq  6
+seven:	dq  7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b2e983362
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+/*
+ * Reset the 16-lane (AVX2) MD5 multi-buffer job manager: the free-lane
+ * stack is refilled with nibble-packed lane ids 0xf..0x0, the in-use
+ * count is cleared, every lane length is set to the 0xFFFFFFFF
+ * sentinel, and no job is attached to any lane.
+ */
+void md5_mb_mgr_init_avx2(MD5_MB_JOB_MGR * state)
+{
+	unsigned int lane = 0;
+
+	state->unused_lanes[0] = 0xfedcba9876543210;
+	state->num_lanes_inuse = 0;
+	while (lane < 16) {
+		state->ldata[lane].job_in_lane = 0;
+		state->lens[lane] = 0xFFFFFFFF;
+		lane++;
+	}
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..e83b2e38f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
@@ -0,0 +1,44 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+/*
+ * Reset the 32-lane (AVX512) MD5 multi-buffer job manager: the free
+ * lane ids 0x00..0x1f are byte-packed across four 64-bit words, the
+ * in-use count is cleared, every lane length is set to the 0xFFFFFFFF
+ * sentinel, and no job is attached to any lane.
+ */
+void md5_mb_mgr_init_avx512(MD5_MB_JOB_MGR * state)
+{
+	unsigned int lane = 0;
+
+	state->unused_lanes[0] = 0x0706050403020100;
+	state->unused_lanes[1] = 0x0f0e0d0c0b0a0908;
+	state->unused_lanes[2] = 0x1716151413121110;
+	state->unused_lanes[3] = 0x1f1e1d1c1b1a1918;
+	state->num_lanes_inuse = 0;
+	while (lane < 32) {
+		state->ldata[lane].job_in_lane = 0;
+		state->lens[lane] = 0xFFFFFFFF;
+		lane++;
+	}
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
new file mode 100644
index 000000000..049d2147d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+/*
+ * Reset the 8-lane (SSE) MD5 multi-buffer job manager: the free-lane
+ * stack is refilled with nibble-packed lane ids (sentinel 0xF above
+ * lanes 7..0), every lane length is set to the 0xFFFFFFFF sentinel,
+ * and no job is attached to any lane.
+ */
+void md5_mb_mgr_init_sse(MD5_MB_JOB_MGR * state)
+{
+	unsigned int lane = 0;
+
+	state->unused_lanes[0] = 0xF76543210;
+	while (lane < 8) {
+		state->ldata[lane].job_in_lane = 0;
+		state->lens[lane] = 0xFFFFFFFF;
+		lane++;
+	}
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..5663942bf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
@@ -0,0 +1,222 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+default rel
+
+extern md5_mb_x4x2_avx
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+;
+; Queue "job" in a free lane.  Hashing only starts once all 8 lanes
+; are occupied; until then NULL is returned.  When the lanes are full,
+; hashing runs until the shortest job completes and that job is
+; returned.
+global md5_mb_mgr_submit_avx:function
+md5_mb_mgr_submit_avx:
+
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	vmovdqa	[rsp + 8*8 + 16*0], xmm6
+	vmovdqa	[rsp + 8*8 + 16*1], xmm7
+	vmovdqa	[rsp + 8*8 + 16*2], xmm8
+	vmovdqa	[rsp + 8*8 + 16*3], xmm9
+	vmovdqa	[rsp + 8*8 + 16*4], xmm10
+	vmovdqa	[rsp + 8*8 + 16*5], xmm11
+	vmovdqa	[rsp + 8*8 + 16*6], xmm12
+	vmovdqa	[rsp + 8*8 + 16*7], xmm13
+	vmovdqa	[rsp + 8*8 + 16*8], xmm14
+	vmovdqa	[rsp + 8*8 + 16*9], xmm15
+%endif
+
+	; pop a free lane off the nibble-packed unused-lanes stack
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF
+	shr	unused_lanes, 4
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	; lens entry packs (num_blocks << 4) | lane_index
+	shl	len, 4
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest and scatter them into the
+	; transposed args_digest layout (one row of 8 lanes per word)
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovd	[state + _args_digest + 4*lane + 0*32], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*32], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*32], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	; only start hashing once all lanes are busy (stack == sentinel)
+	cmp	unused_lanes, 0xF
+	jne	return_null
+
+start_loop:
+	; Find min length
+	vmovdqa	xmm0, [state + _lens + 0*16]
+	vmovdqa	xmm1, [state + _lens + 1*16]
+
+	vpminud	xmm2, xmm0, xmm1	; xmm2 has {D,C,B,A}
+	vpalignr xmm3, xmm3, xmm2, 8	; xmm3 has {x,x,D,C}
+	vpminud	xmm2, xmm2, xmm3	; xmm2 has {x,x,E,F}
+	vpalignr xmm3, xmm3, xmm2, 4	; xmm3 has {x,x,x,E}
+	vpminud	xmm2, xmm2, xmm3	; xmm2 has min value in low dword
+
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF	; low nibble: index of the shortest lane
+	shr	len2, 4		; high bits: number of blocks to hash
+	jz	len_is_0
+
+	vpand	xmm2, xmm2, [rel clear_low_nibble]
+	vpshufd	xmm2, xmm2, 0
+
+	; subtract the min length from every lane's remaining length
+	vpsubd	xmm0, xmm0, xmm2
+	vpsubd	xmm1, xmm1, xmm2
+
+	vmovdqa	[state + _lens + 0*16], xmm0
+	vmovdqa	[state + _lens + 1*16], xmm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x4x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push the freed lane back onto the unused-lanes stack
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the 4 transposed digest words of lane idx into xmm0
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*32]
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa	xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa	xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa	xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa	xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; NULL: job queued, nothing completed yet
+	jmp	return
+
+
+section .data align=16
+
+align 16
+; mask that clears the low nibble (the packed lane index) of the min
+; lens dword, leaving only the block count (already shifted left by 4)
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..9279b855d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
@@ -0,0 +1,235 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobberred by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_avx2:function
+md5_mb_mgr_submit_avx2:
+
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	vmovdqa	[rsp + 8*8 + 16*0], xmm6
+	vmovdqa	[rsp + 8*8 + 16*1], xmm7
+	vmovdqa	[rsp + 8*8 + 16*2], xmm8
+	vmovdqa	[rsp + 8*8 + 16*3], xmm9
+	vmovdqa	[rsp + 8*8 + 16*4], xmm10
+	vmovdqa	[rsp + 8*8 + 16*5], xmm11
+	vmovdqa	[rsp + 8*8 + 16*6], xmm12
+	vmovdqa	[rsp + 8*8 + 16*7], xmm13
+	vmovdqa	[rsp + 8*8 + 16*8], xmm14
+	vmovdqa	[rsp + 8*8 + 16*9], xmm15
+%endif
+
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF		; next free lane index = low nibble
+	shr	unused_lanes, 4		; pop that nibble off the free-lane stack
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	shl	len, 4			; pack: length in upper bits, lane idx in low 4 bits
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovd	[state + _args_digest + 4*lane + 0*64], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*64], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*64], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*64], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	add	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	cmp	num_lanes_inuse, 16
+	jne	return_null		; not all 16 lanes full yet: queue job, return NULL
+
+start_loop:
+	; Find min length: each lens dword is (length<<4)|lane_idx, so the
+	; minimum dword also carries the index of the shortest lane
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1	; elementwise min of the 16 lens
+	vpalignr ymm3, ymm3, ymm2, 8	; rotate dwords by 2 within each 128-bit half
+	vpminud ymm2, ymm2, ymm3	; min over pairs
+	vpalignr ymm3, ymm3, ymm2, 4	; rotate dwords by 1
+	vpminud ymm2, ymm2, ymm3	; per-half min in each half's low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF		; lane index of the shortest job
+	shr	len2, 4			; its remaining length
+	jz	len_is_0		; shortest job already complete: skip hashing
+
+	vpand	ymm2, ymm2, [rel clear_low_nibble]	; keep length bits only
+	vpshufd	ymm2, ymm2, 0		; broadcast min length to every dword
+
+	vpsubd	ymm0, ymm0, ymm2	; subtract the min length from every lane
+	vpsubd	ymm1, ymm1, ymm2
+
+	vmovdqu	[state + _lens + 0*32], ymm0
+	vmovdqu	[state + _lens + 1*32], ymm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x8x2_avx2
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	shl	unused_lanes, 4		; push the freed lane idx back onto the stack
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF	; idle lane never wins the min search
+
+	; Gather the four digest words of lane "idx" into the job's result
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*64]
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*64], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*64], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa	xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa	xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa	xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa	xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; return NULL: no job completed yet
+	jmp	return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..40102ccce
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
@@ -0,0 +1,280 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobberred by md5_mb_x16_avx512
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes ymm7
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+;; Byte-shift right a 32-byte field in memory by one byte; reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr + 1]	; load bytes [addr+1 .. addr+32]
+	vmovdqu [%%addr], %%TMP_YMM	; store back at [addr] => each byte moves down one slot
+	mov	[%%addr + 31], byte 0	; vacated top byte is cleared
+%endmacro
+
+;; Byte-shift left a 32-byte field in memory by one byte; reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr-1]	; load bytes [addr-1 .. addr+30]
+	vmovdqu [%%addr], %%TMP_YMM	; store back at [addr] => each byte moves up one slot
+	mov	[%%addr], byte 0	; vacated low byte is cleared (caller overwrites it)
+%endmacro
+
+align 64
+default rel
+section .text
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_avx512:function
+md5_mb_mgr_submit_avx512:
+
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	vmovdqa	[rsp + 8*8 + 16*0], xmm6
+	vmovdqa	[rsp + 8*8 + 16*1], xmm7
+	vmovdqa	[rsp + 8*8 + 16*2], xmm8
+	vmovdqa	[rsp + 8*8 + 16*3], xmm9
+	vmovdqa	[rsp + 8*8 + 16*4], xmm10
+	vmovdqa	[rsp + 8*8 + 16*5], xmm11
+	vmovdqa	[rsp + 8*8 + 16*6], xmm12
+	vmovdqa	[rsp + 8*8 + 16*7], xmm13
+	vmovdqa	[rsp + 8*8 + 16*8], xmm14
+	vmovdqa	[rsp + 8*8 + 16*9], xmm15
+%endif
+
+	mov	lane, [state + _unused_lanes]
+	and	lane, 0x3F		; next free lane = low byte's 6 index bits (32 lanes)
+	MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes	; pop one byte off the free-lane queue
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	DWORD(len), [job + _len]
+
+	shl	len, 6			; low 6 bits store idx (matches "and idx, 0x3F" below)
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovd	[state + _args_digest + 4*lane + 0*4*16*2], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	add	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	cmp	num_lanes_inuse, 32
+	jne	return_null		; not all 32 lanes full yet: queue job, return NULL
+
+start_loop:
+	; Find min length of lanes 0-15 (lens dwords are (length<<6)|lane_idx)
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1	; elementwise min of these 16 lens
+	vpalignr ymm3, ymm3, ymm2, 8	; rotate dwords by 2 within each 128-bit half
+	vpminud ymm2, ymm2, ymm3	; min over pairs
+	vpalignr ymm3, ymm3, ymm2, 4	; rotate dwords by 1
+	vpminud ymm2, ymm2, ymm3	; per-half min in each half's low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+
+	; Find min length of lanes 16-31
+	vmovdqu ymm5, [state + _lens + 2*32]
+	vmovdqu ymm6, [state + _lens + 3*32]
+
+	vpminud ymm4, ymm5, ymm6	; elementwise min of these 16 lens
+	vpalignr ymm3, ymm3, ymm4, 8	; rotate dwords by 2 within each 128-bit half
+	vpminud ymm4, ymm4, ymm3	; min over pairs
+	vpalignr ymm3, ymm3, ymm4, 4	; rotate dwords by 1
+	vpminud ymm4, ymm4, ymm3	; per-half min in each half's low dword
+	vperm2i128 ymm3, ymm4, ymm4, 1	; ymm3 has halves of ymm4 reversed
+	vpminud ymm4, ymm4, ymm3	; ymm4 has min value in low dword
+
+	vpminud ymm2, ymm2, ymm4	; overall min of all 32 lanes in low dword
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0x3F		; lane index of the shortest job
+	shr	len2, 6			; its remaining length
+	jz	len_is_0		; shortest job already complete: skip hashing
+
+	vpand	ymm2, ymm2, [rel clear_low_6bits]	; keep length bits only
+	vpshufd	ymm2, ymm2, 0		; broadcast min length to every dword
+
+	vpsubd	ymm0, ymm0, ymm2	; subtract the min length from every lane
+	vpsubd	ymm1, ymm1, ymm2
+	vpsubd	ymm5, ymm5, ymm2
+	vpsubd	ymm6, ymm6, ymm2
+
+	vmovdqu	[state + _lens + 0*32], ymm0
+	vmovdqu	[state + _lens + 1*32], ymm1
+	vmovdqu	[state + _lens + 2*32], ymm5
+	vmovdqu	[state + _lens + 3*32], ymm6
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x16x2_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	lane, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+
+	shl	lane, 8			; free-lane queue uses one byte per lane
+	or	lane, idx		; freed idx goes into the low byte
+	MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes	; shift queue up one byte
+	mov	[state + _unused_lanes], lane
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF	; idle lane never wins the min search
+
+	; Gather the four digest words of lane "idx" into the job's result
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa	xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa	xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa	xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa	xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; return NULL: no job completed yet
+	jmp	return
+
+
+section .data align=32
+
+align 32
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_submit_avx512
+no_md5_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..e15118583
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
@@ -0,0 +1,223 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobberred by md5_mb_x4x2_sse
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_sse:function
+md5_mb_mgr_submit_sse:
+
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	movdqa	[rsp + 8*8 + 16*0], xmm6
+	movdqa	[rsp + 8*8 + 16*1], xmm7
+	movdqa	[rsp + 8*8 + 16*2], xmm8
+	movdqa	[rsp + 8*8 + 16*3], xmm9
+	movdqa	[rsp + 8*8 + 16*4], xmm10
+	movdqa	[rsp + 8*8 + 16*5], xmm11
+	movdqa	[rsp + 8*8 + 16*6], xmm12
+	movdqa	[rsp + 8*8 + 16*7], xmm13
+	movdqa	[rsp + 8*8 + 16*8], xmm14
+	movdqa	[rsp + 8*8 + 16*9], xmm15
+%endif
+
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF		; next free lane index = low nibble
+	shr	unused_lanes, 4		; pop that nibble off the free-lane stack
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	shl	len, 4			; pack: length in upper bits, lane idx in low 4 bits
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	movdqu	xmm0, [job + _result_digest + 0*16]
+	movd	[state + _args_digest + 4*lane + 0*32], xmm0
+	pextrd	[state + _args_digest + 4*lane + 1*32], xmm0, 1
+	pextrd	[state + _args_digest + 4*lane + 2*32], xmm0, 2
+	pextrd	[state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	cmp	unused_lanes, 0xF	; sentinel nibble alone => all 8 lanes now busy
+	jne	return_null		; not full yet: queue job, return NULL
+
+start_loop:
+	; Find min length: each lens dword is (length<<4)|lane_idx, so the
+	; minimum dword also carries the index of the shortest lane
+	movdqa	xmm0, [state + _lens + 0*16]
+	movdqa	xmm1, [state + _lens + 1*16]
+
+	movdqa	xmm2, xmm0
+	pminud	xmm2, xmm1		; elementwise min of the 8 lens: {D,C,B,A}
+	palignr	xmm3, xmm2, 8		; xmm3 has {x,x,D,C}
+	pminud	xmm2, xmm3		; min over pairs in low 2 dwords
+	palignr	xmm3, xmm2, 4		; rotate dwords by 1
+	pminud	xmm2, xmm3		; xmm2 has min value in low dword
+
+	movd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF		; lane index of the shortest job
+	shr	len2, 4			; its remaining length
+	jz	len_is_0		; shortest job already complete: skip hashing
+
+	pand	xmm2, [rel clear_low_nibble]	; keep length bits only
+	pshufd	xmm2, xmm2, 0		; broadcast min length to every dword
+
+	psubd	xmm0, xmm2		; subtract the min length from every lane
+	psubd	xmm1, xmm2
+
+	movdqa	[state + _lens + 0*16], xmm0
+	movdqa	[state + _lens + 1*16], xmm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x4x2_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	shl	unused_lanes, 4		; push the freed lane idx back onto the stack
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF	; idle lane never wins the min search
+
+	; Gather the four digest words of lane "idx" into the job's result
+	movd	xmm0, [state + _args_digest + 4*idx + 0*32]
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	movdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa	xmm6, [rsp + 8*8 + 16*0]
+	movdqa	xmm7, [rsp + 8*8 + 16*1]
+	movdqa	xmm8, [rsp + 8*8 + 16*2]
+	movdqa	xmm9, [rsp + 8*8 + 16*3]
+	movdqa	xmm10, [rsp + 8*8 + 16*4]
+	movdqa	xmm11, [rsp + 8*8 + 16*5]
+	movdqa	xmm12, [rsp + 8*8 + 16*6]
+	movdqa	xmm13, [rsp + 8*8 + 16*7]
+	movdqa	xmm14, [rsp + 8*8 + 16*8]
+	movdqa	xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; return NULL: no job completed yet
+	jmp	return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
new file mode 100644
index 000000000..5efeda710
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
@@ -0,0 +1,151 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+// Fill buf with buffer_size pseudo-random bytes (low byte of each rand() call)
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+
+	printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+	srand(TEST_SEED);
+
+	posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));	// NOTE(review): return value unchecked; mgr may stay NULL on failure
+	md5_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Reference digest via OpenSSL's MD5()
+		MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+
+		// Same data through the multi-buffer implementation
+		md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (md5_ctx_mgr_flush(mgr)) ;	// drain all outstanding jobs
+
+	// Compare multi-buffer digests word-by-word against OpenSSL
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != ((uint32_t *) digest_ssl[i])[j]) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       ((uint32_t *) digest_ssl[i])[j]);
+			}
+		}
+	}
+	putchar('.');
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);
+
+		md5_ctx_mgr_init(mgr);
+
+		for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Run SSL test
+			MD5(bufs[i], lens[i], digest_ssl[i]);
+
+			// Run sb_md5 test
+			md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (md5_ctx_mgr_flush(mgr)) ;
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] !=
+				    ((uint32_t *) digest_ssl[i])[j]) {
+					fail++;
+					printf("Test%d, digest%d fail %08X <=> %08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       ((uint32_t *) digest_ssl[i])[j]);
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %d\n", fail);
+			return fail;
+		}
+
+		putchar('.');
+		fflush(0);
+	}			// random test t
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_md5_ssl rand: Pass\n");
+
+	return fail;	// 0 on success; buffers intentionally not freed (process exits)
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
new file mode 100644
index 000000000..451bcbc13
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
@@ -0,0 +1,196 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Fill buf with buffer_size pseudo-random bytes (low byte of each rand() call)
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	uint8_t *tmp_buf;
+
+	printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+	posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));	// NOTE(review): return value unchecked; mgr may stay NULL on failure
+	md5_ctx_mgr_init(mgr);
+
+	srand(TEST_SEED);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+		// Run sb_md5 test
+		md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (md5_ctx_mgr_flush(mgr)) ;	// drain all outstanding jobs
+
+	// Compare multi-buffer digests word-by-word against the reference
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%d fixed size, digest%d "
+				       "fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	if (fail) {
+		printf("Test failed function check %d\n", fail);
+		return fail;
+	}
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);
+
+		md5_ctx_mgr_init(mgr);
+
+		for (i = 0; i < jobs; i++) {
+			// Use buffer with random len and contents
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Run reference test
+			md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+			// Run md5_mb test
+			md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (md5_ctx_mgr_flush(mgr)) ;
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+					fail++;
+					printf("Test%d, digest%d fail "
+					       "0x%08X <=> 0x%08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       digest_ref[i][j]);
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %d\n", fail);
+			return fail;
+		}
+
+		putchar('.');
+		fflush(0);
+	}			// random test t
+
+	// Test jobs that run right up to the end of an allocated buffer
+	// (catches out-of-bounds reads past the data)
+	jobs = rand() % TEST_BUFS;
+	tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+	if (!tmp_buf) {
+		printf("malloc failed, end test aborted.\n");
+		return 1;
+	}
+
+	rand_buffer(tmp_buf, jobs);
+
+	md5_ctx_mgr_init(mgr);
+
+	// Extend to the end of allocated buffer to construct jobs
+	for (i = 0; i < jobs; i++) {
+		bufs[i] = (uint8_t *) & tmp_buf[i];
+		lens[i] = jobs - i;	// each job ends exactly at the buffer's last byte
+
+		// Reference test
+		md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+		// sb_md5 test
+		md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+	}
+
+	while (md5_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < jobs; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("End test failed at offset %d - result: 0x%08X"
+				       ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	putchar('.');
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_md5 rand: Pass\n");
+
+	return fail;	// 0 on success; buffers intentionally not freed (process exits)
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
new file mode 100644
index 000000000..4737a94bf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
@@ -0,0 +1,291 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*MD5_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*MD5_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+// Fill buf[0..buffer_size-1] with pseudo-random bytes from rand().
+// rand() returns int; the assignment truncates to unsigned char, so only
+// the low 8 bits of each rand() result are used.  Determinism comes from
+// the srand(TEST_SEED) call in main().
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+// Test driver for the multi-buffer MD5 update path (HASH_FIRST /
+// HASH_UPDATE / HASH_LAST / HASH_ENTIRE).  Phase 1 hashes TEST_BUFS
+// buffers in fixed UPDATE_SIZE chunks; phase 2 runs RANDOMS rounds with a
+// random job count, random buffer lengths and random per-submit chunk
+// sizes.  Every digest is compared against the serial md5_ref()
+// reference.  Returns the accumulated failure count (0 on success).
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+	uint32_t i, j, fail = 0;
+	int len_done, len_rem, len_rand;
+	unsigned char *bufs[TEST_BUFS];
+	unsigned char *buf_ptr[TEST_BUFS];	// per-job cursor into bufs[i]; (buf_ptr[i]-bufs[i]) = bytes already submitted
+	uint32_t lens[TEST_BUFS];
+	unsigned int joblen, jobs, t;
+
+	printf("multibinary_md5_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	srand(TEST_SEED);	// fixed seed => reproducible run
+
+	// NOTE(review): posix_memalign return value is unchecked; on failure
+	// mgr stays NULL and md5_ctx_mgr_init() would dereference it.  The
+	// cast should also be (void **)&mgr rather than (void *)&mgr.
+	posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	md5_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		buf_ptr[i] = bufs[i];
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+	}
+
+	// Run sb_md5 tests.  i only advances when the manager accepts a new
+	// chunk; a non-NULL unfinished ctx may belong to a *different* job,
+	// which is why i is reassigned from ctx->user_data below.
+	for (i = 0; i < TEST_BUFS;) {
+		len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_done == 0)
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i],
+						 buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+		else if (len_rem <= UPDATE_SIZE)
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i],
+						 buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Add jobs while available or finished
+		if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+			i++;
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+	}
+
+	// Start flushing finished jobs, end on last flushed
+	ctx = md5_ctx_mgr_flush(mgr);
+	while (ctx) {
+		if (hash_ctx_complete(ctx)) {
+			debug_char('-');
+			ctx = md5_ctx_mgr_flush(mgr);
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+
+		len_done = (int)((unsigned long)buf_ptr[i]
+				 - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_rem <= UPDATE_SIZE)
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i],
+						 buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		if (ctx == NULL)
+			ctx = md5_ctx_mgr_flush(mgr);
+	}
+
+	// Check digests
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+	putchar('.');
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);	// may be 0 => round degenerates to a no-op
+
+		for (i = 0; i < jobs; i++) {
+			joblen = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], joblen);
+			lens[i] = joblen;
+			buf_ptr[i] = bufs[i];
+			md5_ref(bufs[i], digest_ref[i], lens[i]);
+		}
+
+		md5_ctx_mgr_init(mgr);
+
+		// Run md5_sb jobs
+		i = 0;
+		while (i < jobs) {
+			// Submit a new job
+			len_rand = MD5_BLOCK_SIZE +
+			    MD5_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+			if (lens[i] > len_rand)
+				ctx = md5_ctx_mgr_submit(mgr,
+							 &ctxpool[i],
+							 buf_ptr[i], len_rand, HASH_FIRST);
+			else
+				ctx = md5_ctx_mgr_submit(mgr,
+							 &ctxpool[i],
+							 buf_ptr[i], lens[i], HASH_ENTIRE);
+
+			// Returned ctx could be:
+			//  - null context (we are just getting started and lanes aren't full yet), or
+			//  - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+			//  - an unfinished ctx, we will resubmit
+
+			if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+				i++;
+				continue;
+			} else {
+				// unfinished ctx returned, choose another random update length and submit either
+				// UPDATE or LAST depending on the amount of buffer remaining
+				while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+					j = (unsigned long)(ctx->user_data);	// Get index of the returned ctx
+					buf_ptr[j] = bufs[j] + ctx->total_length;
+					// NOTE(review): this product can be 0 (either rand() %
+					// factor may be 0), i.e. a zero-length UPDATE submit —
+					// presumably deliberate edge-case coverage; confirm the
+					// manager accepts length 0.
+					len_rand = (rand() % MD5_BLOCK_SIZE)
+					    * (rand() % MAX_RAND_UPDATE_BLOCKS);
+					len_rem = lens[j] - ctx->total_length;
+
+					if (len_rem <= len_rand)	// submit the rest of the job as LAST
+						ctx = md5_ctx_mgr_submit(mgr,
+									 &ctxpool[j],
+									 buf_ptr[j],
+									 len_rem, HASH_LAST);
+					else	// submit the random update length as UPDATE
+						ctx = md5_ctx_mgr_submit(mgr,
+									 &ctxpool[j],
+									 buf_ptr[j],
+									 len_rand,
+									 HASH_UPDATE);
+				}	// Either continue submitting any contexts returned here as UPDATE/LAST, or
+				// go back to submitting new jobs using the index i.
+
+				i++;
+			}
+		}
+
+		// Start flushing finished jobs, end on last flushed
+		ctx = md5_ctx_mgr_flush(mgr);
+		while (ctx) {
+			if (hash_ctx_complete(ctx)) {
+				debug_char('-');
+				ctx = md5_ctx_mgr_flush(mgr);
+				continue;
+			}
+			// Resubmit unfinished job
+			i = (unsigned long)(ctx->user_data);
+			buf_ptr[i] = bufs[i] + ctx->total_length;	// update buffer pointer
+			len_rem = lens[i] - ctx->total_length;
+			len_rand = (rand() % MD5_BLOCK_SIZE)
+			    * (rand() % MAX_RAND_UPDATE_BLOCKS);
+			debug_char('+');
+			if (len_rem <= len_rand)
+				ctx = md5_ctx_mgr_submit(mgr,
+							 &ctxpool[i],
+							 buf_ptr[i], len_rem, HASH_LAST);
+			else
+				ctx = md5_ctx_mgr_submit(mgr,
+							 &ctxpool[i],
+							 buf_ptr[i], len_rand, HASH_UPDATE);
+
+			if (ctx == NULL)
+				ctx = md5_ctx_mgr_flush(mgr);
+		}
+
+		// Check result digest
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+					fail++;
+					printf("Test%d, digest%d fail %8X <=> %8X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       digest_ref[i][j]);
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %d\n", fail);
+			return fail;	// NOTE(review): bufs[]/mgr intentionally not freed; OS reclaims at exit
+		}
+
+		putchar('.');
+		fflush(0);	// fflush(NULL) flushes all open output streams
+	}			// random test t
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_md5_update rand: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
new file mode 100644
index 000000000..bd1ad8e0c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
@@ -0,0 +1,223 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "md5_mb.h"
+
+typedef uint32_t DigestMD5[MD5_DIGEST_NWORDS];
+
+#define MSGS 13
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "Test vector from febooti.com";
+static uint8_t msg2[] = "12345678901234567890" "12345678901234567890"
+ "12345678901234567890" "12345678901234567890";
+static uint8_t msg3[] = "";
+static uint8_t msg4[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msg5[] = "message digest";
+static uint8_t msg6[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msg7[] = "abc";
+static uint8_t msg8[] = "a";
+
+static uint8_t msg9[] = "";
+static uint8_t msgA[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msgB[] = "message digest";
+static uint8_t msgC[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msgD[] = "abc";
+
+static DigestMD5 expResultDigest1 = { 0x61b60a50, 0xfbb76d3c, 0xf5620cd3, 0x0f3d57ff };
+static DigestMD5 expResultDigest2 = { 0xa2f4ed57, 0x55c9e32b, 0x2eda49ac, 0x7ab60721 };
+static DigestMD5 expResultDigest3 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigest4 = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigest5 = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigest6 = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigest7 = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+static DigestMD5 expResultDigest8 = { 0xb975c10c, 0xa8b6f1c0, 0xe299c331, 0x61267769 };
+
+static DigestMD5 expResultDigest9 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigestA = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigestB = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigestC = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigestD = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9,
+ msgA, msgB, msgC, msgD
+};
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8, expResultDigest9,
+ expResultDigestA, expResultDigestB, expResultDigestC,
+ expResultDigestD
+};
+
+// Known-answer test for multi-buffer MD5: hashes the MSGS fixed test
+// vectors against precomputed digests, once in submission order and once
+// in a pseudo-random order over NUM_JOBS submissions.  Digests are stored
+// little-endian-word-wise (see expResultDigest* above).  Returns 0 on
+// success, -1 on any mismatch or manager error.
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+	uint32_t i, j, k, t, checked = 0;
+	uint32_t *good;
+
+	// NOTE(review): posix_memalign return value unchecked; mgr stays
+	// NULL on failure and the init call below would dereference it.
+	posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	md5_ctx_mgr_init(mgr);
+
+	// Init contexts before first use
+	for (i = 0; i < MSGS; i++) {
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	// Submit each message as a single HASH_ENTIRE job.  submit() may
+	// return some earlier job's completed ctx (or NULL while lanes
+	// fill), so the returned ctx's user_data selects the expected
+	// digest — not the loop index i.
+	for (i = 0; i < MSGS; i++) {
+		ctx = md5_ctx_mgr_submit(mgr,
+					 &ctxpool[i], msgs[i],
+					 strlen((char *)msgs[i]), HASH_ENTIRE);
+
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			good = expResultDigest[t];
+			checked++;
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the submit."
+				       " Error code: %d", ctx->error);
+				return -1;
+			}
+
+		}
+	}
+
+	// Drain the remaining in-flight jobs; flush() returns NULL when the
+	// manager is empty.
+	while (1) {
+		ctx = md5_ctx_mgr_flush(mgr);
+
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			good = expResultDigest[t];
+			checked++;
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the submit."
+				       " Error code: %d", ctx->error);
+				return -1;
+			}
+		} else {
+			break;
+		}
+	}
+
+	// do larger test in pseudo-random order
+
+	// Init contexts before first use
+	for (i = 0; i < NUM_JOBS; i++) {
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	checked = 0;
+	for (i = 0; i < NUM_JOBS; i++) {
+		j = PSEUDO_RANDOM_NUM(i);	// deterministic message choice for job i
+		ctx = md5_ctx_mgr_submit(mgr,
+					 &ctxpool[i],
+					 msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+		if (ctx) {
+			// Recover which message job t hashed by re-applying
+			// the same pseudo-random mapping to its index.
+			t = (unsigned long)(ctx->user_data);
+			k = PSEUDO_RANDOM_NUM(t);
+			good = expResultDigest[k];
+			checked++;
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the"
+				       " submit. Error code: %d", ctx->error);
+				return -1;
+			}
+
+			// NOTE(review): dead code — t and k are recomputed to
+			// the same values as above and never used afterwards.
+			t = (unsigned long)(ctx->user_data);
+			k = PSEUDO_RANDOM_NUM(t);
+		}
+	}
+	while (1) {
+		ctx = md5_ctx_mgr_flush(mgr);
+
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			k = PSEUDO_RANDOM_NUM(t);
+			good = expResultDigest[k];
+			checked++;
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+
+			if (ctx->error) {
+				printf("Something bad happened during the submit."
+				       " Error code: %d", ctx->error);
+				return -1;
+			}
+		} else {
+			break;
+		}
+	}
+
+	// Every one of the NUM_JOBS submissions must have been verified
+	// exactly once (via submit return or flush).
+	if (checked != NUM_JOBS) {
+		printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+		return -1;
+	}
+
+	printf(" multibinary_md5 test: Pass\n");
+
+	return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..7e9acde28
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
@@ -0,0 +1,123 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+// Benchmark multi-buffer MD5 against OpenSSL's single-buffer MD5() over
+// TEST_BUFS zero-filled buffers of TEST_LEN bytes, TEST_LOOPS iterations
+// each, then cross-check the two digest sets.  Returns the number of
+// digest-word mismatches (0 on success).
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, t, fail = 0;
+	struct perf start, stop;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// calloc zero-fills, so both implementations hash identical
+		// all-zero buffers.
+		bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1);
+		if (bufs[i] == NULL) {
+			printf("calloc failed test aborted\n");
+			return 1;
+		}
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	// NOTE(review): posix_memalign return value unchecked; mgr stays
+	// NULL on failure and md5_ctx_mgr_init() would dereference it.
+	posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	md5_ctx_mgr_init(mgr);
+
+	// Start OpenSSL tests
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+	}
+	perf_stop(&stop);
+
+	printf("md5_openssl" TEST_TYPE_STR ": ");
+	// i and t hold their post-loop values (TEST_BUFS and TEST_LOOPS),
+	// so TEST_LEN * i * t is the total bytes hashed.
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	// Start mb tests
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+		// Drain all lanes before starting the next iteration
+		while (md5_ctx_mgr_flush(mgr)) ;
+	}
+	perf_stop(&stop);
+
+	printf("multibinary_md5" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	// Compare the final-iteration digests word-for-word; digest_ssl is
+	// raw bytes, reinterpreted here as 32-bit words to match the mb
+	// result layout.
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != ((uint32_t *) digest_ssl[i])[j]) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       ((uint32_t *) digest_ssl[i])[j]);
+			}
+		}
+	}
+
+	printf("Multi-buffer md5 test complete %d buffers of %d B with "
+	       "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_md5_ossl_perf: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
new file mode 100644
index 000000000..7ce641409
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
@@ -0,0 +1,850 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+default rel
+
+;; code to compute double octal MD5 using AVX512
+
+;; Stack must be aligned to 64 bytes before call
+
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers zmm0-8, 14-31
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) global func_name:function internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth
+%define DPTR1 rbx
+%define DPTR2 var2
+%define IDX var1
+%define TBL rax
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; Transposed Digest Storage
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define A1 zmm4
+%define B1 zmm5
+%define C1 zmm6
+%define D1 zmm7
+
+%define md5c zmm16
+
+%define MASK0 zmm17
+%define MASK1 zmm18
+
+%define TMP0 zmm20
+%define TMP1 zmm21
+
+
+;; Data are stored into the Wx after transposition
+%define W0 zmm8
+%define W1 zmm9
+%define W2 zmm10
+%define W3 zmm11
+%define W4 zmm12
+%define W5 zmm13
+%define W6 zmm14
+%define W7 zmm15
+
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; NOTE(review): MD5_DIGEST_ROW_SIZE is %define'd twice in this file — here
+; as (16*4) and again below as (32*4).  NASM silently accepts single-line
+; macro redefinition and the later definition wins for all subsequent uses,
+; so this first one is dead; confirm (32*4) (32 lanes x 4 bytes) is the
+; intended row stride and consider deleting this line upstream.
+%define MD5_DIGEST_ROW_SIZE (16*4)
+%define APPEND(a,b) a %+ b
+%define APPEND3(a,b,c) a %+ b %+ c
+
+;; Temporary registers used during data transposition
+
+; RESZ reserves N zmm-sized (64-byte) slots: "RESZ n" expands to "resb 64*n"
+%define RESZ resb 64*
+;; Assume stack aligned to 64 bytes before call
+;; Therefore FRAMESIZE mod 64 must be 64-8 = 56
+struc STACK
+_DATA: RESZ 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESZ 8 ; stores Z_AA-Z_DD, Z_AA2-Z_DD2
+_TMPDIGEST: RESZ 2 ; stores Z_AA, Z_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+; Stack addresses of the eight 64-byte digest accumulators (A..D for each
+; of the two lane sets), laid out consecutively in the _DIGEST area.
+%define Z_AA rsp + _DIGEST + 64*0
+%define Z_BB rsp + _DIGEST + 64*1
+%define Z_CC rsp + _DIGEST + 64*2
+%define Z_DD rsp + _DIGEST + 64*3
+%define Z_AA1 rsp + _DIGEST + 64*4
+%define Z_BB1 rsp + _DIGEST + 64*5
+%define Z_CC1 rsp + _DIGEST + 64*6
+%define Z_DD1 rsp + _DIGEST + 64*7
+
+; Redefinition — this value supersedes the (16*4) definition above.
+%define MD5_DIGEST_ROW_SIZE (32*4)
+
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+%define rot11 7
+%define rot12 12
+%define rot13 17
+%define rot14 22
+%define rot21 5
+%define rot22 9
+%define rot23 14
+%define rot24 20
+%define rot31 4
+%define rot32 11
+%define rot33 16
+%define rot34 23
+%define rot41 6
+%define rot42 10
+%define rot43 15
+%define rot44 21
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0}
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00}
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1}
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, MASK0
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, MASK1
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, MASK0
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, MASK1
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, MASK0
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, MASK1
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, MASK0
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, MASK1
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, MASK0
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, MASK1
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, MASK0
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, MASK1
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, MASK0
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, MASK1
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, MASK0
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, MASK1
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_ARGS1 0
+%xdefine TMP_ D1
+%xdefine D1 C1
+%xdefine C1 B1
+%xdefine B1 A1
+%xdefine A1 TMP_
+%endm
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot)
+;;eg: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx
+%macro PROCESS_LOOP 6
+%define %%MD5const %1
+%define %%data %2
+%define %%F_IMMED %3
+%define %%NROT %4
+%define %%TMP_PR0 %5
+%define %%TMP_PR1 %6
+ ; a=b+((a+Ft(b,c,d)+Mj+ti)<<s)
+
+ ; Ft
+ ; 0-15 Ft:F(X,Y,Z)=(X&Y)|((~X)&Z) 0xca
+ ; 16-31 Ft:G(X,Y,Z)=(X&Z)|(Y&(~Z)) 0xe4
+ ; 32-47 Ft:H(X,Y,Z)=X^Y^Z 0x96
+ ; 48-63 Ft:I(X,Y,Z)=Y^(X|(~Z)) 0x39
+
+ vpaddd A, A, %%MD5const
+ vpaddd A1, A1, %%MD5const
+ vpaddd A, A, [%%data]
+ vpaddd A1, A1, [%%data + 16*64]
+ vmovdqa32 %%TMP_PR0, B ; Copy B
+ vmovdqa32 %%TMP_PR1, B1 ; Copy B
+ vpternlogd %%TMP_PR0, C, D, %%F_IMMED
+ vpternlogd %%TMP_PR1, C1, D1, %%F_IMMED
+ vpaddd A, A, %%TMP_PR0
+ vpaddd A1, A1, %%TMP_PR1
+ vprold A, A, %%NROT
+ vprold A1, A1, %%NROT
+ vpaddd A, A, B
+ vpaddd A1, A1, B1
+
+ ROTATE_ARGS
+ ROTATE_ARGS1
+%endmacro
+
+align 64
+default rel
+section .text
+
+; void md5_mb_x16x2_avx512(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+local_func_decl(md5_mb_x16x2_avx512)
+md5_mb_x16x2_avx512:
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -64
+ mov [rsp + _RSP_SAVE], rax
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 64*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+ vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2]
+
+ ;; Preload input data from 16 segments.
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(15)*64],W15
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(16+15)*64],W15
+
+ ;; Initialize digests
+ ;; vmovdqu32 replace vmovdqa32
+ vmovdqu32 A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE]
+ ; Load the digest for each stream (9-16)
+ vmovdqu32 A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64]
+
+.lloop:
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ; Save digests for later addition
+ vmovdqa32 [Z_AA], A
+ vmovdqa32 [Z_BB], B
+ vmovdqa32 [Z_CC], C
+ vmovdqa32 [Z_DD], D
+ vmovdqa32 [Z_AA1], A1
+ vmovdqa32 [Z_BB1], B1
+ vmovdqa32 [Z_CC1], C1
+ vmovdqa32 [Z_DD1], D1
+
+ sub SIZE, 1
+ je .LastLoop
+
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(15)*64],W15
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(16+15)*64],W15
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+ ;; Proceed to processing of next block
+ jmp .lloop
+
+.LastLoop:
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+ ;; update into data pointers
+%assign I 0
+%rep 16
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vmovdqu32 [DIGEST + 0*MD5_DIGEST_ROW_SIZE ], A
+ vmovdqu32 [DIGEST + 1*MD5_DIGEST_ROW_SIZE ], B
+ vmovdqu32 [DIGEST + 2*MD5_DIGEST_ROW_SIZE ], C
+ vmovdqu32 [DIGEST + 3*MD5_DIGEST_ROW_SIZE ], D
+ ; Store the digest for each stream (9-16)
+ vmovdqu32 [DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1
+ vmovdqu32 [DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1
+ vmovdqu32 [DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1
+ vmovdqu32 [DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1
+
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
+ dd 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+ dd 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+ dd 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+ dd 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+ dd 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+ dd 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+ dd 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+ dd 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+ dd 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+ dd 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+ dd 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+ dd 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+ dd 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+ dd 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+ dd 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_x16x2_avx512
+no_md5_mb_x16x2_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
new file mode 100644
index 000000000..1b4927909
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
@@ -0,0 +1,782 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute octal MD5 using AVX
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z,[ONES] ; pnot %%F
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ ;movdqa %%tmp, %%reg
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, [%%data]
+ vpaddd %%A2, %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ vpaddd %%A, %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ vpaddd %%A2, %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vmovdqa %%TMP,[%%data]
+ vmovdqa %%TMP2,[%%data + 16*16]
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, %%TMP
+ vpaddd %%A2, %%A2, %%TMP2
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ vpaddd %%A, %%A, %%FUN
+ vpaddd %%A2, %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_avx(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
+align 32
+global md5_mb_x4x2_avx:function internal
+md5_mb_x4x2_avx:
+ sub rsp, STACK_SIZE
+
+ ;; Initialize digests
+ vmovdqu A,[arg1+0*16]
+ vmovdqu B,[arg1+2*16]
+ vmovdqu C,[arg1+4*16]
+ vmovdqu D,[arg1+6*16]
+
+ vmovdqu A2,[arg1+1*16]
+ vmovdqu B2,[arg1+3*16]
+ vmovdqu C2,[arg1+5*16]
+ vmovdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16],T0
+ vmovdqa [mem1+(I*4+1)*16],T1
+ vmovdqa [mem1+(I*4+2)*16],T2
+ vmovdqa [mem1+(I*4+3)*16],T3
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+
+ ; save old digests
+ vmovdqa [AA], A
+ vmovdqa [BB], B
+ vmovdqa [CC], C
+ vmovdqa [DD], D
+ ; save old digests
+ vmovdqa [AA2], A2
+ vmovdqa [BB2], B2
+ vmovdqa [CC2], C2
+ vmovdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+15*16, [TBL+15*16], rot14
+
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 4*16, [TBL+23*16], rot24
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+12*16, [TBL+31*16], rot24
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+10*16, [TBL+39*16], rot34
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 2*16, [TBL+47*16], rot34
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 1*16, [TBL+55*16], rot44
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 9*16, [TBL+63*16], rot44
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+63*16], rot44
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; write out digests
+ vmovdqu [arg1+0*16], A
+ vmovdqu [arg1+2*16], B
+ vmovdqu [arg1+4*16], C
+ vmovdqu [arg1+6*16], D
+
+ vmovdqu [arg1+1*16], A2
+ vmovdqu [arg1+3*16], B2
+ vmovdqu [arg1+5*16], C2
+ vmovdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+; MD5 round constants T[1..64] from RFC 1321 (T[i] = floor(2^32 * abs(sin(i)))),
+; each constant replicated across 4 dwords so a single 128-bit load feeds
+; all 4 lanes of one SIMD digest set.
+MD5_TABLE:
+	dd	0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+	dd	0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+	dd	0x242070db, 0x242070db, 0x242070db, 0x242070db
+	dd	0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+	dd	0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+	dd	0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+	dd	0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+	dd	0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+	dd	0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+	dd	0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+	dd	0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+	dd	0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+	dd	0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+	dd	0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+	dd	0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+	dd	0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+	dd	0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+	dd	0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+	dd	0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+	dd	0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+	dd	0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+	dd	0x02441453, 0x02441453, 0x02441453, 0x02441453
+	dd	0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+	dd	0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+	dd	0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+	dd	0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+	dd	0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+	dd	0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+	dd	0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+	dd	0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+	dd	0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+	dd	0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+	dd	0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+	dd	0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+	dd	0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+	dd	0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+	dd	0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+	dd	0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+	dd	0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+	dd	0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+	dd	0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+	dd	0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+	dd	0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+	dd	0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+	dd	0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+	dd	0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+	dd	0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+	dd	0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+	dd	0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+	dd	0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+	dd	0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+	dd	0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+	dd	0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+	dd	0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+	dd	0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+	dd	0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+	dd	0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+	dd	0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+	dd	0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+	dd	0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+	dd	0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+	dd	0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+	dd	0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+	dd	0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+
+; all-ones mask: XORing with this implements the bitwise NOT that the
+; MAGIC_I round function needs (SSE/AVX have no "pnot" instruction)
+ONES:
+	dd	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
new file mode 100644
index 000000000..f3fc29eca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
@@ -0,0 +1,778 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute 8 lanes of MD5 in parallel (two sets of 4 lanes) using SSE
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+; 4x4 dword matrix transpose: after this, each register holds the same
+; message-word index from all four input lanes, which is the layout the
+; 4-wide MD5 rounds consume.  Note the outputs land in {t0 r1 r0 r3},
+; not {r0 r1 r2 r3} -- callers must store from those registers.
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+	movdqa	%%t0, %%r0
+	shufps	%%t0, %%r1, 0x44	; t0 = {b1 b0 a1 a0}
+	shufps	%%r0, %%r1, 0xEE	; r0 = {b3 b2 a3 a2}
+
+	movdqa	%%t1, %%r2
+	shufps	%%t1, %%r3, 0x44	; t1 = {d1 d0 c1 c0}
+	shufps	%%r2, %%r3, 0xEE	; r2 = {d3 d2 c3 c2}
+
+	movdqa	%%r1, %%t0
+	shufps	%%r1, %%t1, 0xDD	; r1 = {d1 c1 b1 a1}
+
+	movdqa	%%r3, %%r0
+	shufps	%%r3, %%r2, 0xDD	; r3 = {d3 c3 b3 a3}
+
+	shufps	%%r0, %%r2, 0x88	; r0 = {d2 c2 b2 a2}
+	shufps	%%t0, %%t1, 0x88	; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+; RFC 1321 round-1 function F(X,Y,Z) = (X & Y) | (~X & Z), computed in the
+; equivalent xor/and form: needs no NOT and only one destination register.
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	movdqa	%%F,%%Z		; F = Z
+	pxor	%%F,%%Y		; F = Y ^ Z
+	pand	%%F,%%X		; F = X & (Y ^ Z)
+	pxor	%%F,%%Z		; F = Z ^ (X & (Y ^ Z))
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+; RFC 1321 round-2 function G(X,Y,Z) = (X & Z) | (Y & ~Z), which is
+; identical to F with its arguments rotated: G(X,Y,Z) = F(Z,X,Y).
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	MAGIC_F	%%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+; RFC 1321 round-3 function: plain 3-way XOR.
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	movdqa	%%F,%%Z		; F = Z
+	pxor	%%F,%%Y		; F = Y ^ Z
+	pxor	%%F,%%X		; F = X ^ Y ^ Z
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+; RFC 1321 round-4 function.  SSE has no NOT instruction, so ~Z is
+; synthesized by XORing with the all-ones constant at [ONES].
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	movdqa	%%F,%%Z
+	pxor	%%F,[ONES]	; pnot %%F   -> F = ~Z
+	por	%%F,%%X		; F = X | ~Z
+	pxor	%%F,%%Y		; F = Y ^ (X | ~Z)
+%endmacro
+
+; PROLD reg, imm, tmp
+; Packed rotate-left of each dword in `reg` by the immediate `imm`,
+; emulated with two shifts and an OR (no packed-rotate before AVX-512).
+; Clobbers `tmp`.
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	movdqa	%%tmp, %%reg
+	psrld	%%tmp, (32-%%imm)	; tmp = reg >> (32-imm)
+	pslld	%%reg, %%imm		; reg = reg << imm
+	por	%%reg, %%tmp		; reg = rol32(reg, imm)
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+;; Applies one MD5 step simultaneously to two sets of 4 lanes:
+;; {A,B,C,D} for lanes 0-3 and {A2,B2,C2,D2} for lanes 4-7.  The second
+;; set's message words live 16*16 bytes above the first set's in the
+;; stack-resident transposed data block.  This variant shares a single
+;; FUN/TMP temp pair, so the two halves are computed back-to-back; it is
+;; used in the main loop where the remaining XMM registers (T0..T5) are
+;; needed for loading/transposing the next data block.
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+	paddd	%%A, %%MD5const		; A  += T[i]
+	paddd	%%A2, %%MD5const	; A2 += T[i]
+	paddd	%%A, [%%data]		; A  += data word (lanes 0-3)
+	paddd	%%A2, [%%data + 16*16]	; A2 += data word (lanes 4-7)
+	%%MAGIC_FUN %%FUN, %%B,%%C,%%D
+	paddd	%%A, %%FUN		; A  += MAGIC(B,C,D)
+	%%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+	paddd	%%A2, %%FUN		; A2 += MAGIC(B2,C2,D2)
+	PROLD	%%A,%%nrot, %%TMP
+	PROLD	%%A2,%%nrot, %%TMP
+	paddd	%%A, %%B		; A  = B  + rol32(A,  nrot)
+	paddd	%%A2, %%B2		; A2 = B2 + rol32(A2, nrot)
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+;; Same dual-4-lane step as MD5_STEP1, but with independent temp pairs
+;; (FUN/TMP and FUN2/TMP2) so the two 4-lane computations can be
+;; interleaved.  Used only for the last block, where no data loading
+;; runs concurrently and xmm10/xmm11 (otherwise T0/T1) are free.
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+	paddd	%%A, %%MD5const		; A  += T[i]
+	paddd	%%A2, %%MD5const	; A2 += T[i]
+	paddd	%%A, [%%data]		; A  += data word (lanes 0-3)
+	paddd	%%A2, [%%data + 16*16]	; A2 += data word (lanes 4-7)
+	%%MAGIC_FUN %%FUN, %%B,%%C,%%D
+	%%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+	paddd	%%A, %%FUN		; A  += MAGIC(B,C,D)
+	paddd	%%A2, %%FUN2		; A2 += MAGIC(B2,C2,D2)
+	PROLD	%%A,%%nrot, %%TMP
+	PROLD	%%A2,%%nrot, %%TMP2
+	paddd	%%A, %%B		; A  = B  + rol32(A,  nrot)
+	paddd	%%A2, %%B2		; A2 = B2 + rol32(A2, nrot)
+%endmacro
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+;; rotRS = rotate amount for round R, step position S of the round's
+;; repeating 4-step pattern (values from RFC 1321).
+rot11 equ  7
+rot12 equ  12
+rot13 equ  17
+rot14 equ  22
+rot21 equ  5
+rot22 equ  9
+rot23 equ  14
+rot24 equ  20
+rot31 equ  4
+rot32 equ  11
+rot33 equ  16
+rot34 equ  23
+rot41 equ  6
+rot42 equ  10
+rot43 equ  15
+rot44 equ  21
+
+; Digest state for lanes 0-3 ...
+%define A	xmm0
+%define B	xmm1
+%define C	xmm2
+%define D	xmm3
+%define E	xmm4 ; tmp
+%define F	xmm5 ; tmp
+
+; ... and for lanes 4-7
+%define A2	xmm6
+%define B2	xmm7
+%define C2	xmm8
+%define D2	xmm9
+
+
+%define FUN	E
+%define TMP	F
+%define FUN2	xmm10
+%define TMP2	xmm11
+
+; NOTE: T0/T1 deliberately alias FUN2/TMP2 (xmm10/xmm11).  This is safe
+; because T0..T5 are used only around MD5_STEP1 (which uses FUN/TMP
+; only), while FUN2/TMP2 are used only in the MD5_STEP last-block path,
+; where no transposes occur.
+%define T0	xmm10
+%define T1	xmm11
+%define T2	xmm12
+%define T3	xmm13
+%define T4	xmm14
+%define T5	xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1	rdi
+%define arg2	rsi
+%define inp7	rcx
+%define mem1	rdx
+%else
+;; Windows Registers
+%define arg1	rcx
+%define arg2	rdx
+%define inp7	rdi
+%define mem1	rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0	r9
+%define inp1	r10
+%define inp2	r11
+%define inp3	r12
+%define inp4	r13
+%define inp5	r14
+%define inp6	r15
+
+%define TBL	rax
+%define IDX	rbx
+%define mem2	rbp
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4	\
+; ...				\
+; 300 data2[0] for lanes 7...4	 \
+; 2F0 data2[15] for lanes 3...0	  > mem block 2
+; ...				 /
+; 210 data2[1] for lanes 3...0	/
+; 200 data2[0] for lanes 3...0	/
+;
+; 1F0 data1[15] for lanes 7...4	\
+; ...				\
+; 100 data1[0] for lanes 7...4	 \
+; F0  data1[15] for lanes 3...0	  > mem block 1
+; ...				 /
+; 10  data1[1] for lanes 3...0	/
+; 0   data1[0] for lanes 3...0	/
+
+MEM		equ 16*16*2*2	; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+; NOTE(review): presumably so that rsp is 16-byte aligned after the
+; `sub rsp, STACK_SIZE` (rsp % 16 == 8 at function entry per the ABI),
+; which the aligned movdqa stores to [AA..DD2] rely on -- confirm.
+STACK_SIZE	equ MEM + 16*8 + 8
+
+; Saved copies of the running digests, added back after each block.
+%define AA	rsp + MEM + 16*0
+%define BB	rsp + MEM + 16*1
+%define CC	rsp + MEM + 16*2
+%define DD	rsp + MEM + 16*3
+%define AA2	rsp + MEM + 16*4
+%define BB2	rsp + MEM + 16*5
+%define CC2	rsp + MEM + 16*6
+%define DD2	rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
+align 32
+global md5_mb_x4x2_sse:function internal
+md5_mb_x4x2_sse:
+ sub rsp, STACK_SIZE
+
+ ;; Initialize digests
+ movdqu A,[arg1+0*16]
+ movdqu B,[arg1+2*16]
+ movdqu C,[arg1+4*16]
+ movdqu D,[arg1+6*16]
+
+ ;; Initialize digests
+ movdqu A2,[arg1+1*16]
+ movdqu B2,[arg1+3*16]
+ movdqu C2,[arg1+5*16]
+ movdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16],T0
+ movdqa [mem1+(I*4+1)*16],T1
+ movdqa [mem1+(I*4+2)*16],T2
+ movdqa [mem1+(I*4+3)*16],T3
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16 + 16*16],T0
+ movdqa [mem1+(I*4+1)*16 + 16*16],T1
+ movdqa [mem1+(I*4+2)*16 + 16*16],T2
+ movdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+ ; save old digests
+ movdqa [AA], A
+ movdqa [BB], B
+ movdqa [CC], C
+ movdqa [DD], D
+ ; save old digests
+ movdqa [AA2], A2
+ movdqa [BB2], B2
+ movdqa [CC2], C2
+ movdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; write out digests
+ movdqu [arg1+0*16], A
+ movdqu [arg1+2*16], B
+ movdqu [arg1+4*16], C
+ movdqu [arg1+6*16], D
+ movdqu [arg1+1*16], A2
+ movdqu [arg1+3*16], B2
+ movdqu [arg1+5*16], C2
+ movdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+; MD5 round constants from RFC 1321: T[i] = floor(abs(sin(i+1)) * 2^32),
+; one row per round (64 rounds). Each constant is replicated across all
+; four dwords so a single 16-byte load broadcasts it to all four lanes.
+MD5_TABLE:
+	dd	0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+	dd	0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+	dd	0x242070db, 0x242070db, 0x242070db, 0x242070db
+	dd	0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+	dd	0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+	dd	0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+	dd	0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+	dd	0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+	dd	0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+	dd	0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+	dd	0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+	dd	0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+	dd	0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+	dd	0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+	dd	0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+	dd	0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+	dd	0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+	dd	0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+	dd	0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+	dd	0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+	dd	0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+	dd	0x02441453, 0x02441453, 0x02441453, 0x02441453
+	dd	0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+	dd	0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+	dd	0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+	dd	0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+	dd	0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+	dd	0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+	dd	0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+	dd	0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+	dd	0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+	dd	0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+	dd	0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+	dd	0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+	dd	0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+	dd	0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+	dd	0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+	dd	0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+	dd	0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+	dd	0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+	dd	0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+	dd	0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+	dd	0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+	dd	0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+	dd	0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+	dd	0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+	dd	0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+	dd	0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+	dd	0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+	dd	0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+	dd	0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+	dd	0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+	dd	0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+	dd	0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+	dd	0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+	dd	0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+	dd	0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+	dd	0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+	dd	0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+	dd	0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+	dd	0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+	dd	0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+	dd	0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+	dd	0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+
+; All-ones vector: XORing with this mask implements a vector-wide NOT
+; (SSE/AVX have no single-operand NOT instruction).
+ONES:
+	dd	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
new file mode 100644
index 000000000..818c0ebbf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
@@ -0,0 +1,917 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute double octal MD5 using AVX2
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers ymm0-15
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rdi
+ %define reg4 rsi
+%else
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1
+%define num_blks arg2
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth
+%define DPTR1 rbx
+%define DPTR2 reg3
+
+%define TBL rax
+%define IDX reg4
+
+;; Transposed Digest Storage
+%define Y_A ymm0
+%define Y_B ymm1
+%define Y_C ymm2
+%define Y_D ymm3
+%define Y_A2 ymm4
+%define Y_B2 ymm5
+%define Y_C2 ymm6
+%define Y_D2 ymm7
+
+;; Temp YMM registers corresponding to the Temp XMM registers
+;; used during the transposition of the digests
+%define Y_KTMP1 ymm12
+%define Y_KTMP2 ymm13
+;; Temporary registers used during MD5 round operations
+%define Y_FUN ymm8
+%define Y_TMP ymm9
+%define Y_FUN2 ymm10
+%define Y_TMP2 ymm11
+
+
+;; YMM registers used during data fetching.
+;; Data are stored into the stack after transposition
+%define Y_DAT0 ymm8
+%define Y_DAT1 ymm9
+%define Y_DAT2 ymm10
+%define Y_DAT3 ymm11
+%define Y_DAT4 ymm12
+%define Y_DAT5 ymm13
+%define Y_DAT6 ymm14
+%define Y_DAT7 ymm15
+
+;; Temporary registers used during data transposition
+%define Y_DTMP1 ymm0
+%define Y_DTMP2 ymm1
+
+
+%define RESY resb 32*
+;; NOTE: "RESY n" expands to "resb 32* n", i.e. it reserves n 32-byte
+;; (YMM-register-sized) slots inside the struc below.
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
+struc STACK
+_DATA:		RESY	2*2*16	; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST:	RESY	8	; stores Y_AA-Y_DD, Y_AA2-Y_DD2
+_TMPDIGEST:	RESY	2	; stores Y_AA, Y_BB temporarily
+_RSP_SAVE:	RESQ	1	; original RSP
+endstruc
+
+
+%define Y_AA rsp + _DIGEST + 32*0
+%define Y_BB rsp + _DIGEST + 32*1
+%define Y_CC rsp + _DIGEST + 32*2
+%define Y_DD rsp + _DIGEST + 32*3
+%define Y_AA2 rsp + _DIGEST + 32*4
+%define Y_BB2 rsp + _DIGEST + 32*5
+%define Y_CC2 rsp + _DIGEST + 32*6
+%define Y_DD2 rsp + _DIGEST + 32*7
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+; 8x8 dword transpose of r0..r7 (see the layout diagram above this macro).
+; t0 and t1 are clobbered. Strategy: vshufps interleaves within each
+; 128-bit half, then vperm2f128 swaps/merges the 128-bit halves.
+
+	; process top half (r0..r3) {a...d}
+	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
+	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
+	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
+	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
+	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
+	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
+	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
+	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d4 c4 b4 a4   d0 c0 b0 a0}
+
+
+	; use r2 in place of t0
+	; process bottom half (r4..r7) {e...h}
+	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
+	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
+	vshufps	%%t1, %%r6, %%r7, 0x44	; t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
+	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
+	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
+	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
+	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
+	vshufps	%%t1, %%r2, %%t1, 0x88	; t1 = {h4 g4 f4 e4   h0 g0 f0 e0}
+
+
+	; merge the 128-bit halves: imm 0x13 takes high lanes, 0x02 low lanes
+	vperm2f128	%%r6, %%r5, %%r1, 0x13	; h6...a6
+	vperm2f128	%%r2, %%r5, %%r1, 0x02	; h2...a2
+	vperm2f128	%%r5, %%r7, %%r3, 0x13	; h5...a5
+	vperm2f128	%%r1, %%r7, %%r3, 0x02	; h1...a1
+	vperm2f128	%%r7, %%r4, %%r0, 0x13	; h7...a7
+	vperm2f128	%%r3, %%r4, %%r0, 0x02	; h3...a3
+	vperm2f128	%%r4, %%t1, %%t0, 0x13	; h4...a4
+	vperm2f128	%%r0, %%t1, %%t0, 0x02	; h0...a0
+%endmacro
+
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z   ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+; Round-1 function F(X,Y,Z) of RFC 1321 in bit-select form:
+; equivalent to (X & Y) | (~X & Z) but needs no extra scratch register.
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	vpxor	%%F,%%Z, %%Y
+	vpand	%%F,%%F,%%X
+	vpxor	%%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z   ;; F = F((Z),(X),(Y))
+; Round-2 function G(X,Y,Z) of RFC 1321, computed by reusing MAGIC_F
+; with rotated arguments: G(X,Y,Z) = F(Z,X,Y).
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	MAGIC_F  %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z   ;; F = ((X) ^ (Y) ^ (Z))
+; Round-3 function H(X,Y,Z) of RFC 1321: three-way XOR (parity).
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	vpxor	%%F,%%Z, %%Y
+	vpxor	%%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z   ;; F = ((Y) ^ ((X) | ~(Z)))
+; Round-4 function I(X,Y,Z) of RFC 1321. AVX has no vector NOT, so
+; ~Z is built by XORing with the all-ones ONES constant.
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	vpxor	%%F,%%Z,[ONES]	; pnot	%%F  (F = ~Z)
+	vpor	%%F,%%F,%%X
+	vpxor	%%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+; Packed rotate-left of each dword in reg by imm bits, using tmp as
+; scratch (AVX2 has no vprold; it is built from shift-left/shift-right/or).
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	vpsrld	%%tmp, %%reg, (32-%%imm)
+	vpslld	%%reg, %%reg, %%imm
+	vpor	%%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+;
+; Performs one MD5 step simultaneously on two sets of 8 lanes:
+; rA..rD hold the digests of streams 1-8, rA2..rD2 those of streams 9-16.
+; The second set's message words live 16*32 bytes (16 regs * 32B) past
+; %%data in the stack data area. FUN/TMP and FUN2/TMP2 are scratch YMMs.
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%rA %2
+%define %%rB %3
+%define %%rC %4
+%define %%rD %5
+%define %%rA2 %6
+%define %%rB2 %7
+%define %%rC2 %8
+%define %%rD2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+	vpaddd	%%rA, %%rA, %%MD5const
+	vpaddd	%%rA2, %%rA2, %%MD5const
+	vpaddd	%%rA, %%rA, [%%data]
+	vpaddd	%%rA2, %%rA2, [%%data + 16*32]	; second lane set's block
+	%%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD
+	%%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2
+	vpaddd	%%rA, %%rA, %%FUN
+	vpaddd	%%rA2, %%rA2, %%FUN2
+	PROLD	%%rA,%%nrot, %%TMP
+	PROLD	%%rA2,%%nrot, %%TMP2
+	vpaddd	%%rA, %%rA, %%rB
+	vpaddd	%%rA2, %%rA2, %%rB2
+%endmacro
+
+align 32
+
+; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+global md5_mb_x8x2_avx2:function internal
+md5_mb_x8x2_avx2:
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _RSP_SAVE], rax
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 32*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+
+ ; Initialize index for data retrieval
+ xor IDX, IDX
+
+ ;; Fetch Pointers to Data Stream 1 to 8
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+%assign I 0
+%rep 2
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+%assign I 0
+%rep 2
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+ ;; digests are already transposed
+ vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ]
+
+ ; Load the digest for each stream (9-16)
+ vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32]
+
+lloop:
+
+ ; save old digests to stack
+ vmovdqa [Y_AA], Y_A
+ vmovdqa [Y_BB], Y_B
+ vmovdqa [Y_CC], Y_C
+ vmovdqa [Y_DD], Y_D
+
+ vmovdqa [Y_AA2], Y_A2
+ vmovdqa [Y_BB2], Y_B2
+ vmovdqa [Y_CC2], Y_C2
+ vmovdqa [Y_DD2], Y_D2
+
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ;; Update size of remaining blocks to process
+ sub num_blks, 1
+ je lastblock
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+
+
+	;; Re-fetch pointers to data streams 1 to 8 (inp0-7 were last
+	;; loaded with the stream 9-16 pointers, so they must be reloaded)
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ ; Add results to old digest values
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+
+ ;; Proceed to processing of next block
+ jmp lloop
+
+lastblock:
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+ ;; update into data pointers
+%assign I 0
+%rep 8
+ mov inp0, [state + _data_ptr + (2*I)*8]
+ mov inp1, [state + _data_ptr + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [state + _data_ptr + (2*I)*8], inp0
+ mov [state + _data_ptr + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2
+
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+
+
+ mov rsp, [rsp + _RSP_SAVE]
+
+ ret
+
+section .data
+align 64
+; MD5 per-round additive constants T[1..64] from RFC 1321
+; (T[i] = floor(2^32 * abs(sin(i)))). Each 32-bit constant is stored
+; 8 times in a row so that a single 32-byte (YMM) load broadcasts the
+; round constant across all 8 parallel hash lanes.
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+; 8 x 0xffffffff: all-ones YMM vector; presumably used to form bitwise NOT
+; via VPXOR in the MAGIC_I round macro -- confirm against the MD5_STEP/MAGIC_I
+; definitions earlier in this file.
+ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+	dd  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
new file mode 100644
index 000000000..2c8212855
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
@@ -0,0 +1,83 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; On ELF64 builds, route dispatched calls through the PLT so the resulting
+; object is position-independent; other formats need no modifier.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT		wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern md5_ctx_mgr_init_sse
+extern md5_ctx_mgr_submit_sse
+extern md5_ctx_mgr_flush_sse
+
+extern md5_ctx_mgr_init_avx
+extern md5_ctx_mgr_submit_avx
+extern md5_ctx_mgr_flush_avx
+
+extern md5_ctx_mgr_init_avx2
+extern md5_ctx_mgr_submit_avx2
+extern md5_ctx_mgr_flush_avx2
+
+; AVX-512 entry points exist only when the assembler understands AVX-512
+; encodings; the build system defines HAVE_AS_KNOWS_AVX512 accordingly.
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern md5_ctx_mgr_init_avx512
+ extern md5_ctx_mgr_submit_avx512
+ extern md5_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit are initial values for *_dispatched; is updated on first call.
+;;; Therefore, *_dispatch_init is only executed on first call.
+
+; Initialise symbols
+; mbin_interface creates the public md5_ctx_mgr_* symbols as indirect jumps
+; through a per-function dispatch pointer (see multibinary.asm).
+mbin_interface md5_ctx_mgr_init
+mbin_interface md5_ctx_mgr_submit
+mbin_interface md5_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6 through replacing base by sse version
+	; (slot order: generic-base, sse, avx, avx2, avx512; here the SSE code
+	; doubles as the baseline implementation).
+	mbin_dispatch_init6 md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2, md5_ctx_mgr_init_avx512
+	mbin_dispatch_init6 md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2, md5_ctx_mgr_submit_avx512
+	mbin_dispatch_init6 md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2, md5_ctx_mgr_flush_avx512
+%else
+	mbin_dispatch_init md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2
+	mbin_dispatch_init md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2
+	mbin_dispatch_init md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2
+%endif
+
+;; Version/ABI bookkeeping consumed by the library's symbol-check tooling.
+;;       func				core, ver, snum
+slversion md5_ctx_mgr_init,		00,   03,  0189
+slversion md5_ctx_mgr_submit,		00,   03,  018a
+slversion md5_ctx_mgr_flush,		00,   03,  018b
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
new file mode 100644
index 000000000..9cb1fd646
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
@@ -0,0 +1,193 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference MD5 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+/* Compress one 64-byte block into digest; defined below. */
+void md5_single(const uint8_t * data, uint32_t digest[4]);
+
+/* MD5 initial digest state A, B, C, D (RFC 1321, section 3.3). */
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+
+/*
+ * Reference (scalar) MD5 over a complete buffer, used by the multi-buffer
+ * tests as ground truth.
+ *
+ * input_data: message bytes (not modified despite the non-const pointer).
+ * digest:     receives the 4-word MD5 state A,B,C,D (host byte order).
+ * len:        message length in bytes.
+ */
+void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len)
+{
+	uint32_t i, j;
+	uint8_t buf[128];	/* room for one or two final padded blocks */
+	union {
+		uint64_t uint;
+		uint8_t uchar[8];
+	} convert;
+	uint8_t *p;
+
+	digest[0] = H0;
+	digest[1] = H1;
+	digest[2] = H2;
+	digest[3] = H3;
+
+	/* Process all full 64-byte blocks directly from the input. */
+	i = len;
+	while (i >= 64) {
+		md5_single(input_data, digest);
+		input_data += 64;
+		i -= 64;
+	}
+	// 0 <= i < 64
+
+	/* Copy the tail, append the mandatory 0x80 pad byte, zero-fill. */
+	memcpy(buf, input_data, i);
+	buf[i++] = 0x80;
+	for (j = i; j < 120; j++)
+		buf[j] = 0;
+
+	/* If the 8-byte length field no longer fits in the first pad block,
+	 * padding spills into a second 64-byte block. */
+	if (i > 64 - 8)
+		i = 128;
+	else
+		i = 64;
+
+	/* Message length in bits, stored in the last 8 bytes.
+	 * NOTE(review): 8 * len is computed in 32 bits and wraps for
+	 * len >= 2^29 bytes (512 MiB) -- confirm callers stay below this.
+	 * NOTE(review): the byte copy below emits host byte order, so the
+	 * required little-endian layout holds only on little-endian hosts. */
+	convert.uint = 8 * len;
+	p = buf + i - 8;
+	p[7] = convert.uchar[7];
+	p[6] = convert.uchar[6];
+	p[5] = convert.uchar[5];
+	p[4] = convert.uchar[4];
+	p[3] = convert.uchar[3];
+	p[2] = convert.uchar[2];
+	p[1] = convert.uchar[1];
+	p[0] = convert.uchar[0];
+
+	md5_single(buf, digest);
+	if (i == 128)
+		md5_single(buf + 64, digest);
+}
+
+/* Round functions (RFC 1321). F1/F2 use the mux form d^(b&(c^d)), which is
+ * equivalent to the spec's (b&c)|(~b&d) with one fewer operation. */
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (c ^ (d & (b ^ c)))
+#define F3(b,c,d) (b ^ c ^ d)
+#define F4(b,c,d) (c ^ (b | ~d))
+
+/* 32-bit rotate left; valid for r in 1..31 (r == 0 or 32 would shift by
+ * the full type width, which is undefined). XOR and OR are equivalent
+ * here since the shifted halves never overlap. */
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+/* One MD5 step. i (a compile-time constant 0..63) selects the round
+ * function; the if-chain folds away at -O. a is updated in place, so the
+ * caller rotates the a/b/c/d argument order between steps. Multi-statement
+ * macro: each use below is a full standalone statement, so the missing
+ * do/while(0) wrapper is harmless here. */
+#define step(i,a,b,c,d,f,k,w,r) \
+	if (i < 16) {f = F1(b,c,d); } else \
+	if (i < 32) {f = F2(b,c,d); } else \
+	if (i < 48) {f = F3(b,c,d); } else \
+	{f = F4(b,c,d); } \
+	f = a + f + k + w; \
+	a = b + rol32(f, r);
+
+/*
+ * Compress one 64-byte message block into the running digest (RFC 1321
+ * section 3.4): 64 steps in four rounds of 16, then add the result back
+ * into the incoming state.
+ */
+void md5_single(const uint8_t * data, uint32_t digest[4])
+{
+	uint32_t a, b, c, d;
+	uint32_t f;
+	/* NOTE(review): reinterpreting the byte pointer as uint32_t* assumes a
+	 * little-endian host and 4-byte-aligned data, and technically violates
+	 * strict aliasing -- acceptable for this test-only reference code, but
+	 * confirm callers pass aligned buffers. */
+	uint32_t *w = (uint32_t *) data;
+
+	a = digest[0];
+	b = digest[1];
+	c = digest[2];
+	d = digest[3];
+
+	/* Round 1 (F): message words in order, rotations 7/12/17/22. */
+	step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
+	step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
+	step(2, c, d, a, b, f, 0x242070db, w[2], 17);
+	step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
+	step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
+	step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
+	step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
+	step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
+	step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
+	step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
+	step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
+	step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
+	step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
+	step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
+	step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
+	step(15, b, c, d, a, f, 0x49b40821, w[15], 22);
+
+	/* Round 2 (G): word index (1 + 5i) mod 16, rotations 5/9/14/20. */
+	step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
+	step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
+	step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
+	step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
+	step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
+	step(21, d, a, b, c, f, 0x02441453, w[10], 9);
+	step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
+	step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
+	step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
+	step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
+	step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
+	step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
+	step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
+	step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
+	step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
+	step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);
+
+	/* Round 3 (H): word index (5 + 3i) mod 16, rotations 4/11/16/23. */
+	step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
+	step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
+	step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
+	step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
+	step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
+	step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
+	step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
+	step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
+	step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
+	step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
+	step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
+	step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
+	step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
+	step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
+	step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
+	step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);
+
+	/* Round 4 (I): word index (7i) mod 16, rotations 6/10/15/21. */
+	step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
+	step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
+	step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
+	step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
+	step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
+	step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
+	step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
+	step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
+	step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
+	step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
+	step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
+	step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
+	step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
+	step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
+	step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
+	step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);
+
+	/* Davies-Meyer feed-forward: add block result into the chaining state. */
+	digest[0] += a;
+	digest[1] += b;
+	digest[2] += c;
+	digest[3] += d;
+}